Buffer manager modifications to keep a local buffer-dirtied bit as well

as a shared dirtybit for each shared buffer.  The shared dirtybit still
controls writing the buffer, but the local bit controls whether we need
to fsync the buffer's file.  This arrangement fixes a bug that allowed
some required fsyncs to be missed, and should improve performance as well.
For more info see my post of same date on pghackers.
This commit is contained in:
Tom Lane 2000-04-09 04:43:20 +00:00
parent 9c38a8d296
commit 1f6d8b90b8
13 changed files with 955 additions and 435 deletions

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.62 2000/03/17 02:36:05 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.63 2000/04/09 04:43:16 tgl Exp $
*
* NOTES
* Transaction aborts can now occur two ways:
@ -642,7 +642,7 @@ RecordTransactionCommit()
{
FlushBufferPool();
if (leak)
ResetBufferPool();
ResetBufferPool(true);
/*
* have the transaction access methods record the status
@ -658,7 +658,7 @@ RecordTransactionCommit()
}
if (leak)
ResetBufferPool();
ResetBufferPool(true);
}
@ -759,7 +759,10 @@ RecordTransactionAbort()
if (SharedBufferChanged && !TransactionIdDidCommit(xid))
TransactionIdAbort(xid);
ResetBufferPool();
/*
* Tell bufmgr and smgr to release resources.
*/
ResetBufferPool(false); /* false -> is abort */
}
/* --------------------------------

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.30 2000/01/26 05:56:10 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/catalog/catalog.c,v 1.31 2000/04/09 04:43:15 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -23,25 +23,87 @@
#include "utils/syscache.h"
/*
* relpath - path to the relation
* Perhaps this should be in-line code in relopen().
* relpath - construct path to a relation's file
*
* Note that this only works with relations that are visible to the current
* backend, ie, either in the current database or shared system relations.
*
* Result is a palloc'd string.
*/
char *
relpath(const char *relname)
{
char *path;
size_t bufsize = 0;
if (IsSharedSystemRelationName(relname))
{
bufsize = strlen(DataDir) + sizeof(NameData) + 2;
/* Shared system relations live in DataDir */
size_t bufsize = strlen(DataDir) + sizeof(NameData) + 2;
path = (char *) palloc(bufsize);
snprintf(path, bufsize, "%s/%s", DataDir, relname);
snprintf(path, bufsize, "%s%c%s", DataDir, SEP_CHAR, relname);
return path;
}
/*
* If it is in the current database, assume it is in current working
* directory. NB: this does not work during bootstrap!
*/
return pstrdup(relname);
}
/*
* relpath_blind - construct path to a relation's file
*
* Construct the path using only the info available to smgrblindwrt,
* namely the names and OIDs of the database and relation. (Shared system
* relations are identified with dbid = 0.) Note that we may have to
* access a relation belonging to a different database!
*
* Result is a palloc'd string.
*/
char *
relpath_blind(const char *dbname, const char *relname,
Oid dbid, Oid relid)
{
char *path;
if (dbid == (Oid) 0)
{
/* Shared system relations live in DataDir */
path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
sprintf(path, "%s%c%s", DataDir, SEP_CHAR, relname);
}
else if (dbid == MyDatabaseId)
{
/* XXX why is this inconsistent with relpath() ? */
path = (char *) palloc(strlen(DatabasePath) + sizeof(NameData) + 2);
sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relname);
}
else
{
/* this is work around only !!! */
char dbpathtmp[MAXPGPATH];
Oid id;
char *dbpath;
GetRawDatabaseInfo(dbname, &id, dbpathtmp);
if (id != dbid)
elog(FATAL, "relpath_blind: oid of db %s is not %u",
dbname, dbid);
dbpath = ExpandDatabasePath(dbpathtmp);
if (dbpath == NULL)
elog(FATAL, "relpath_blind: can't expand path for db %s",
dbname);
path = (char *) palloc(strlen(dbpath) + sizeof(NameData) + 2);
sprintf(path, "%s%c%s", dbpath, SEP_CHAR, relname);
pfree(dbpath);
}
return path;
}
/*
* IsSystemRelationName
* True iff name is the name of a system catalog relation.

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.32 2000/01/26 05:56:50 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.33 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -65,9 +65,11 @@ long *NWaitIOBackendP;
extern IpcSemaphoreId WaitIOSemId;
long *PrivateRefCount; /* also used in freelist.c */
bits8 *BufferLocks; /* */
long *CommitInfoNeedsSave;/* to write buffers where we have filled
* in t_infomask */
bits8 *BufferLocks; /* flag bits showing locks I have set */
BufferTag *BufferTagLastDirtied; /* tag buffer had when last dirtied by me */
BufferBlindId *BufferBlindLastDirtied; /* and its BlindId too */
bool *BufferDirtiedByMe; /* T if buf has been dirtied in cur xact */
/*
* Data Structures:
@ -247,7 +249,9 @@ InitBufferPool(IPCKey key)
#endif
PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
BufferLocks = (bits8 *) calloc(NBuffers, sizeof(bits8));
CommitInfoNeedsSave = (long *) calloc(NBuffers, sizeof(long));
BufferTagLastDirtied = (BufferTag *) calloc(NBuffers, sizeof(BufferTag));
BufferBlindLastDirtied = (BufferBlindId *) calloc(NBuffers, sizeof(BufferBlindId));
BufferDirtiedByMe = (bool *) calloc(NBuffers, sizeof(bool));
}
/* -----------------------------------------------------

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.20 2000/01/26 05:56:52 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.21 2000/04/09 04:43:19 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -122,7 +122,7 @@ PinBuffer_Debug(char *file, int line, BufferDesc *buf)
fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
@ -168,7 +168,7 @@ UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
buffer, buf->blind.relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
@ -304,7 +304,7 @@ PrintBufferFreeList()
int i = (buf - BufferDescriptors);
printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n",
i, buf->sb_relname, buf->tag.blockNum,
i, buf->blind.relname, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i],
buf->freeNext, buf->freePrev);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.54 2000/03/17 02:36:19 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.55 2000/04/09 04:43:19 tgl Exp $
*
* NOTES:
*
@ -293,7 +293,7 @@ LruDelete(File file)
vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
Assert(vfdP->seekPos != -1);
/* if we have written to the file, sync it */
/* if we have written to the file, sync it before closing */
if (vfdP->fdstate & FD_DIRTY)
{
returnValue = pg_fsync(vfdP->fd);
@ -381,9 +381,6 @@ tryAgain:
returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
Assert(returnValue != -1);
}
/* Update state as appropriate for re-open (needed?) */
vfdP->fdstate &= ~FD_DIRTY;
}
/*
@ -804,7 +801,7 @@ FileWrite(File file, char *buffer, int amount)
if (returnCode > 0)
VfdCache[file].seekPos += returnCode;
/* record the write */
/* mark the file as needing fsync */
VfdCache[file].fdstate |= FD_DIRTY;
return returnCode;
@ -873,6 +870,35 @@ FileTruncate(File file, long offset)
return returnCode;
}
/*
* FileSync --- if a file is marked as dirty, fsync it.
*
* The FD_DIRTY bit is slightly misnamed: it doesn't mean that we need to
* write the file, but that we *have* written it and need to execute an
* fsync() to ensure the changes are down on disk before we mark the current
* transaction committed.
*
* FD_DIRTY is set by FileWrite or by an explicit FileMarkDirty() call.
* It is cleared after successfully fsync'ing the file. FileClose() will
* fsync a dirty File that is about to be closed, since there will be no
* other place to remember the need to fsync after the VFD is gone.
*
* Note that the DIRTY bit is logically associated with the actual disk file,
* not with any particular kernel FD we might have open for it. We assume
* that fsync will force out any dirty buffers for that file, whether or not
* they were written through the FD being used for the fsync call --- they
* might even have been written by some other backend!
*
* Note also that LruDelete currently fsyncs a dirty file that it is about
* to close the kernel file descriptor for. The idea there is to avoid
* having to re-open the kernel descriptor later. But it's not real clear
* that this is a performance win; we could end up fsyncing the same file
* multiple times in a transaction, which would probably cost more time
* than is saved by avoiding an open() call. This should be studied.
*
* This routine used to think it could skip the fsync if the file is
* physically closed, but that is now WRONG; see comments for FileMarkDirty.
*/
int
FileSync(File file)
{
@ -880,23 +906,66 @@ FileSync(File file)
Assert(FileIsValid(file));
/*
* If the file isn't open, then we don't need to sync it; we always
* sync files when we close them. Also, if we haven't done any writes
* that we haven't already synced, we can ignore the request.
*/
if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY))
returnCode = 0;
else
if (!(VfdCache[file].fdstate & FD_DIRTY))
{
returnCode = pg_fsync(VfdCache[file].fd);
/* Need not sync if file is not dirty. */
returnCode = 0;
}
else if (disableFsync)
{
/* Don't force the file open if pg_fsync isn't gonna sync it. */
returnCode = 0;
VfdCache[file].fdstate &= ~FD_DIRTY;
}
else
{
/* We don't use FileAccess() because we don't want to force the
* file to the front of the LRU ring; we aren't expecting to
* access it again soon.
*/
if (FileIsNotOpen(file))
{
returnCode = LruInsert(file);
if (returnCode != 0)
return returnCode;
}
returnCode = pg_fsync(VfdCache[file].fd);
if (returnCode == 0)
VfdCache[file].fdstate &= ~FD_DIRTY;
}
return returnCode;
}
/*
* FileMarkDirty --- mark a file as needing fsync at transaction commit.
*
* Since FileWrite marks the file dirty, this routine is not needed in
* normal use. It is called when the buffer manager detects that some other
* backend has written out a shared buffer that this backend dirtied (but
* didn't write) in the current xact. In that scenario, we need to fsync
* the file before we can commit. We cannot assume that the other backend
* has fsync'd the file yet; we need to do our own fsync to ensure that
* (a) the disk page is written and (b) this backend's commit is delayed
* until the write is complete.
*
* Note we are assuming that an fsync issued by this backend will write
* kernel disk buffers that were dirtied by another backend. Furthermore,
* it doesn't matter whether we currently have the file physically open;
* we must fsync even if we have to re-open the file to do it.
*/
void
FileMarkDirty(File file)
{
Assert(FileIsValid(file));
DO_DB(elog(DEBUG, "FileMarkDirty: %d (%s)",
file, VfdCache[file].fileName));
VfdCache[file].fdstate |= FD_DIRTY;
}
/*
* Routines that want to use stdio (ie, FILE*) should use AllocateFile
* rather than plain fopen(). This lets fd.c deal with freeing FDs if
@ -992,6 +1061,12 @@ closeAllVfds()
* exit (it doesn't particularly care which). All still-open temporary-file
* VFDs are closed, which also causes the underlying files to be deleted.
* Furthermore, all "allocated" stdio files are closed.
*
* This routine is not involved in fsync'ing non-temporary files at xact
* commit; that is done by FileSync under control of the buffer manager.
* During a commit, that is done *before* control gets here. If we still
* have any needs-fsync bits set when we get here, we assume this is abort
* and clear them.
*/
void
AtEOXact_Files(void)
@ -1006,6 +1081,8 @@ AtEOXact_Files(void)
if ((VfdCache[i].fdstate & FD_TEMPORARY) &&
VfdCache[i].fileName != NULL)
FileClose(i);
else
VfdCache[i].fdstate &= ~FD_DIRTY;
}
}

View File

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.64 2000/02/07 02:38:18 inoue Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -48,7 +48,12 @@
typedef struct _MdfdVec
{
int mdfd_vfd; /* fd number in vfd pool */
uint16 mdfd_flags; /* clean, dirty, free */
int mdfd_flags; /* free, temporary */
/* these are the assigned bits in mdfd_flags: */
#define MDFD_FREE (1 << 0)/* unused entry */
#define MDFD_TEMP (1 << 1)/* close this entry at transaction end */
int mdfd_lstbcnt; /* most recent block count */
int mdfd_nextFree; /* next free vector */
#ifndef LET_OS_MANAGE_FILESIZE
@ -62,13 +67,13 @@ static int Md_Free = -1; /* head of freelist of unused fdvec entries */
static int CurFd = 0; /* first never-used fdvec index */
static MemoryContext MdCxt; /* context for all my allocations */
#define MDFD_DIRTY (uint16) 0x01
#define MDFD_FREE (uint16) 0x02
/* routines declared here */
static void mdclose_fd(int fd);
static int _mdfd_getrelnfd(Relation reln);
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno);
static MdfdVec *_mdfd_blind_getseg(char *dbname, char *relname,
Oid dbid, Oid relid, int blkno);
static int _fdvec_alloc(void);
static void _fdvec_free(int);
static BlockNumber _mdnblocks(File file, Size blcksz);
@ -186,6 +191,8 @@ mdcreate(Relation reln)
#endif
Md_fdvec[vfd].mdfd_lstbcnt = 0;
pfree(path);
return vfd;
}
@ -290,9 +297,6 @@ mdextend(Relation reln, char *buffer)
return SM_FAIL;
}
/* remember that we did a write, so we can sync at xact commit */
v->mdfd_flags |= MDFD_DIRTY;
/* try to keep the last block count current, though it's just a hint */
#ifndef LET_OS_MANAGE_FILESIZE
if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
@ -367,6 +371,8 @@ mdopen(Relation reln)
#endif
#endif
pfree(path);
return vfd;
}
@ -382,13 +388,24 @@ int
mdclose(Relation reln)
{
int fd;
MdfdVec *v;
MemoryContext oldcxt;
fd = RelationGetFile(reln);
if (fd < 0)
return SM_SUCCESS; /* already closed, so no work */
mdclose_fd(fd);
reln->rd_fd = -1;
return SM_SUCCESS;
}
static void
mdclose_fd(int fd)
{
MdfdVec *v;
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(MdCxt);
#ifndef LET_OS_MANAGE_FILESIZE
for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL;)
@ -398,17 +415,14 @@ mdclose(Relation reln)
/* if not closed already */
if (v->mdfd_vfd >= 0)
{
/*
* We sync the file descriptor so that we don't need to reopen
* it at transaction commit to force changes to disk.
* it at transaction commit to force changes to disk. (This
* is not really optional, because we are about to forget that
* the file even exists...)
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
/* mark this file descriptor as clean in our private table */
v->mdfd_flags &= ~MDFD_DIRTY;
}
/* Now free vector */
v = v->mdfd_chain;
@ -423,28 +437,20 @@ mdclose(Relation reln)
{
if (v->mdfd_vfd >= 0)
{
/*
* We sync the file descriptor so that we don't need to reopen
* it at transaction commit to force changes to disk.
* it at transaction commit to force changes to disk. (This
* is not really optional, because we are about to forget that
* the file even exists...)
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
/* mark this file descriptor as clean in our private table */
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
#endif
MemoryContextSwitchTo(oldcxt);
_fdvec_free(fd);
/* be sure to mark relation closed */
reln->rd_fd = -1;
return SM_SUCCESS;
}
/*
@ -521,8 +527,6 @@ mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
status = SM_FAIL;
v->mdfd_flags |= MDFD_DIRTY;
return status;
}
@ -560,14 +564,6 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer)
|| FileSync(v->mdfd_vfd) < 0)
status = SM_FAIL;
/*
* By here, the block is written and changes have been forced to
* stable storage. Mark the descriptor as clean until the next write,
* so we don't sync it again unnecessarily at transaction commit.
*/
v->mdfd_flags &= ~MDFD_DIRTY;
return status;
}
@ -575,141 +571,89 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer)
* mdblindwrt() -- Write a block to disk blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs. This
* is a synchronous write.
* the database and relation in which the block belongs. Otherwise
* this is just like mdwrite().
*/
int
mdblindwrt(char *dbstr,
char *relstr,
mdblindwrt(char *dbname,
char *relname,
Oid dbid,
Oid relid,
BlockNumber blkno,
char *buffer)
{
int fd;
int segno;
long seekpos;
int status;
char *path;
long seekpos;
MdfdVec *v;
#ifndef LET_OS_MANAGE_FILESIZE
int nchars;
v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);
/* be sure we have enough space for the '.segno', if any */
segno = blkno / RELSEG_SIZE;
if (segno > 0)
nchars = 10;
else
nchars = 0;
/* construct the path to the file and open it */
/* system table? then put in system area... */
if (dbid == (Oid) 0)
{
path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s/%s", DataDir, relstr);
else
sprintf(path, "%s/%s.%d", DataDir, relstr, segno);
}
/* user table? then put in user database area... */
else if (dbid == MyDatabaseId)
{
path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
else
sprintf(path, "%s%c%s.%d", DatabasePath, SEP_CHAR, relstr, segno);
}
else
/* this is work arround only !!! */
{
char dbpath[MAXPGPATH];
Oid id;
char *tmpPath;
GetRawDatabaseInfo(dbstr, &id, dbpath);
if (id != dbid)
elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
tmpPath = ExpandDatabasePath(dbpath);
if (tmpPath == NULL)
elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
else
sprintf(path, "%s%c%s.%d", tmpPath, SEP_CHAR, relstr, segno);
pfree(tmpPath);
}
#else
/* construct the path to the file and open it */
/* system table? then put in system area... */
if (dbid == (Oid) 0)
{
path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2);
sprintf(path, "%s/%s", DataDir, relstr);
}
/* user table? then put in user database area... */
else if (dbid == MyDatabaseId)
{
path = (char *) palloc(strlen(DatabasePath) + 2 * sizeof(NameData) + 2);
sprintf(path, "%s%c%s", DatabasePath, SEP_CHAR, relstr);
}
else
/* this is work arround only !!! */
{
char dbpath[MAXPGPATH];
Oid id;
char *tmpPath;
GetRawDatabaseInfo(dbstr, &id, dbpath);
if (id != dbid)
elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
tmpPath = ExpandDatabasePath(dbpath);
if (tmpPath == NULL)
elog(FATAL, "mdblindwrt: can't expand path for db %s", dbstr);
path = (char *) palloc(strlen(tmpPath) + 2 * sizeof(NameData) + 2);
sprintf(path, "%s%c%s", tmpPath, SEP_CHAR, relstr);
pfree(tmpPath);
}
#endif
#ifndef __CYGWIN32__
if ((fd = open(path, O_RDWR, 0600)) < 0)
#else
if ((fd = open(path, O_RDWR | O_BINARY, 0600)) < 0)
#endif
if (v == NULL)
return SM_FAIL;
/* seek to the right spot */
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
#else
seekpos = (long) (BLCKSZ * (blkno));
#endif
if (lseek(fd, seekpos, SEEK_SET) != seekpos)
{
close(fd);
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
return SM_FAIL;
}
status = SM_SUCCESS;
/* write and sync the block */
if (write(fd, buffer, BLCKSZ) != BLCKSZ || (pg_fsync(fd) < 0))
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
status = SM_FAIL;
if (close(fd) < 0)
status = SM_FAIL;
pfree(path);
return status;
}
/*
* mdmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mdmarkdirty(Relation reln, BlockNumber blkno)
{
MdfdVec *v;
v = _mdfd_getseg(reln, blkno);
FileMarkDirty(v->mdfd_vfd);
return SM_SUCCESS;
}
/*
* mdblindmarkdirty() -- Mark the specified block "dirty" (ie, needs fsync).
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs. Otherwise
* this is just like mdmarkdirty().
*/
int
mdblindmarkdirty(char *dbname,
char *relname,
Oid dbid,
Oid relid,
BlockNumber blkno)
{
MdfdVec *v;
v = _mdfd_blind_getseg(dbname, relname, dbid, relid, blkno);
if (v == NULL)
return SM_FAIL;
FileMarkDirty(v->mdfd_vfd);
return SM_SUCCESS;
}
/*
* mdnblocks() -- Get the number of blocks stored in a relation.
*
@ -873,19 +817,26 @@ mdcommit()
for (i = 0; i < CurFd; i++)
{
#ifndef LET_OS_MANAGE_FILESIZE
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
#else
v = &Md_fdvec[i];
if (v != (MdfdVec *) NULL)
#endif
if (v->mdfd_flags & MDFD_FREE)
continue;
if (v->mdfd_flags & MDFD_TEMP)
{
if (v->mdfd_flags & MDFD_DIRTY)
/* Sync and close the file */
mdclose_fd(i);
}
else
{
/* Sync, but keep the file entry */
#ifndef LET_OS_MANAGE_FILESIZE
for ( ; v != (MdfdVec *) NULL; v = v->mdfd_chain)
#else
if (v != (MdfdVec *) NULL)
#endif
{
if (FileSync(v->mdfd_vfd) < 0)
return SM_FAIL;
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
}
@ -908,13 +859,14 @@ mdabort()
for (i = 0; i < CurFd; i++)
{
#ifndef LET_OS_MANAGE_FILESIZE
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain)
v->mdfd_flags &= ~MDFD_DIRTY;
#else
v = &Md_fdvec[i];
v->mdfd_flags &= ~MDFD_DIRTY;
#endif
if (v->mdfd_flags & MDFD_FREE)
continue;
if (v->mdfd_flags & MDFD_TEMP)
{
/* Close the file */
mdclose_fd(i);
}
}
return SM_SUCCESS;
@ -995,7 +947,6 @@ _fdvec_free(int fdvec)
Md_fdvec[fdvec].mdfd_nextFree = Md_Free;
Md_fdvec[fdvec].mdfd_flags = MDFD_FREE;
Md_Free = fdvec;
}
static MdfdVec *
@ -1004,19 +955,17 @@ _mdfd_openseg(Relation reln, int segno, int oflags)
MemoryContext oldcxt;
MdfdVec *v;
int fd;
bool dofree;
char *path,
*fullpath;
/* be sure we have enough space for the '.segno', if any */
path = relpath(RelationGetPhysicalRelationName(reln));
dofree = false;
if (segno > 0)
{
dofree = true;
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%d", path, segno);
pfree(path);
}
else
fullpath = path;
@ -1028,8 +977,7 @@ _mdfd_openseg(Relation reln, int segno, int oflags)
fd = FileNameOpenFile(fullpath, O_RDWR | O_BINARY | oflags, 0600);
#endif
if (dofree)
pfree(fullpath);
pfree(fullpath);
if (fd < 0)
return (MdfdVec *) NULL;
@ -1109,6 +1057,104 @@ _mdfd_getseg(Relation reln, int blkno)
return v;
}
/* Find the segment of the relation holding the specified block.
* This is the same as _mdfd_getseg() except that we must work
* "blind" with no Relation struct.
*
* NOTE: we have no easy way to tell whether a FD already exists for the
* target relation, so we always make a new one. This should probably
* be improved somehow, but I doubt it's a significant performance issue
* under normal circumstances. The FD is marked to be closed at end of xact
* so that we don't accumulate a lot of dead FDs.
*/
static MdfdVec *
_mdfd_blind_getseg(char *dbname, char *relname, Oid dbid, Oid relid,
int blkno)
{
MdfdVec *v;
char *path;
int fd;
int vfd;
#ifndef LET_OS_MANAGE_FILESIZE
int segno;
int targsegno;
#endif
/* construct the path to the file and open it */
path = relpath_blind(dbname, relname, dbid, relid);
#ifndef __CYGWIN32__
fd = FileNameOpenFile(path, O_RDWR, 0600);
#else
fd = FileNameOpenFile(path, O_RDWR | O_BINARY, 0600);
#endif
if (fd < 0)
return NULL;
vfd = _fdvec_alloc();
if (vfd < 0)
return NULL;
Md_fdvec[vfd].mdfd_vfd = fd;
Md_fdvec[vfd].mdfd_flags = MDFD_TEMP;
Md_fdvec[vfd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifndef LET_OS_MANAGE_FILESIZE
Md_fdvec[vfd].mdfd_chain = (MdfdVec *) NULL;
#ifdef DIAGNOSTIC
if (Md_fdvec[vfd].mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big on relopen!");
#endif
targsegno = blkno / RELSEG_SIZE;
for (v = &Md_fdvec[vfd], segno = 1; segno <= targsegno; segno++)
{
char *segpath;
MdfdVec *newv;
MemoryContext oldcxt;
segpath = (char *) palloc(strlen(path) + 12);
sprintf(segpath, "%s.%d", path, segno);
#ifndef __CYGWIN32__
fd = FileNameOpenFile(segpath, O_RDWR | O_CREAT, 0600);
#else
fd = FileNameOpenFile(segpath, O_RDWR | O_BINARY | O_CREAT, 0600);
#endif
pfree(segpath);
if (fd < 0)
return (MdfdVec *) NULL;
/* allocate an mdfdvec entry for it */
oldcxt = MemoryContextSwitchTo(MdCxt);
newv = (MdfdVec *) palloc(sizeof(MdfdVec));
MemoryContextSwitchTo(oldcxt);
/* fill the entry */
newv->mdfd_vfd = fd;
newv->mdfd_flags = MDFD_TEMP;
newv->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
newv->mdfd_chain = (MdfdVec *) NULL;
#ifdef DIAGNOSTIC
if (newv->mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big on open!");
#endif
v->mdfd_chain = newv;
v = newv;
}
#else
v = &Md_fdvec[vfd];
#endif
pfree(path);
return v;
}
static BlockNumber
_mdnblocks(File file, Size blcksz)
{

View File

@ -11,7 +11,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.32 2000/01/26 05:57:05 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.33 2000/04/09 04:43:20 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -23,21 +23,30 @@ static void smgrshutdown(int dummy);
typedef struct f_smgr
{
int (*smgr_init) ();/* may be NULL */
int (*smgr_shutdown) (); /* may be NULL */
int (*smgr_create) ();
int (*smgr_unlink) ();
int (*smgr_extend) ();
int (*smgr_open) ();
int (*smgr_close) ();
int (*smgr_read) ();
int (*smgr_write) ();
int (*smgr_flush) ();
int (*smgr_blindwrt) ();
int (*smgr_nblocks) ();
int (*smgr_truncate) ();
int (*smgr_commit) (); /* may be NULL */
int (*smgr_abort) (); /* may be NULL */
int (*smgr_init) (void); /* may be NULL */
int (*smgr_shutdown) (void); /* may be NULL */
int (*smgr_create) (Relation reln);
int (*smgr_unlink) (Relation reln);
int (*smgr_extend) (Relation reln, char *buffer);
int (*smgr_open) (Relation reln);
int (*smgr_close) (Relation reln);
int (*smgr_read) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_write) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_flush) (Relation reln, BlockNumber blocknum,
char *buffer);
int (*smgr_blindwrt) (char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
int (*smgr_markdirty) (Relation reln, BlockNumber blkno);
int (*smgr_blindmarkdirty) (char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno);
int (*smgr_nblocks) (Relation reln);
int (*smgr_truncate) (Relation reln, int nblocks);
int (*smgr_commit) (void); /* may be NULL */
int (*smgr_abort) (void); /* may be NULL */
} f_smgr;
/*
@ -49,14 +58,14 @@ static f_smgr smgrsw[] = {
/* magnetic disk */
{mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdtruncate,
mdcommit, mdabort},
mdread, mdwrite, mdflush, mdblindwrt, mdmarkdirty, mdblindmarkdirty,
mdnblocks, mdtruncate, mdcommit, mdabort},
#ifdef STABLE_MEMORY_STORAGE
/* main memory */
{mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, NULL,
mmcommit, mmabort},
mmread, mmwrite, mmflush, mmblindwrt, mmmarkdirty, mmblindmarkdirty,
mmnblocks, NULL, mmcommit, mmabort},
#endif
};
@ -299,6 +308,7 @@ smgrblindwrt(int16 which,
char *relstr;
int status;
/* strdup here is probably redundant */
dbstr = pstrdup(dbname);
relstr = pstrdup(relname);
@ -315,6 +325,67 @@ smgrblindwrt(int16 which,
return status;
}
/*
* smgrmarkdirty() -- Mark a page dirty (needs fsync).
*
* Mark the specified page as needing to be fsync'd before commit.
* Ordinarily, the storage manager will do this implicitly during
* smgrwrite(). However, the buffer manager may discover that some
* other backend has written a buffer that we dirtied in the current
* transaction. In that case, we still need to fsync the file to be
* sure the page is down to disk before we commit.
*/
int
smgrmarkdirty(int16 which,
Relation reln,
BlockNumber blkno)
{
int status;
status = (*(smgrsw[which].smgr_markdirty)) (reln, blkno);
if (status == SM_FAIL)
elog(ERROR, "cannot mark block %d of %s",
blkno, RelationGetRelationName(reln));
return status;
}
/*
* smgrblindmarkdirty() -- Mark a page dirty, "blind".
*
* Just like smgrmarkdirty, except we don't have a reldesc.
*/
int
smgrblindmarkdirty(int16 which,
char *dbname,
char *relname,
Oid dbid,
Oid relid,
BlockNumber blkno)
{
char *dbstr;
char *relstr;
int status;
/* strdup here is probably redundant */
dbstr = pstrdup(dbname);
relstr = pstrdup(relname);
status = (*(smgrsw[which].smgr_blindmarkdirty)) (dbstr, relstr,
dbid, relid,
blkno);
if (status == SM_FAIL)
elog(ERROR, "cannot mark block %d of %s [%s] blind",
blkno, relstr, dbstr);
pfree(dbstr);
pfree(relstr);
return status;
}
/*
* smgrnblocks() -- Calculate the number of POSTGRES blocks in the
* supplied relation.
@ -378,7 +449,6 @@ smgrcommit()
return SM_SUCCESS;
}
#ifdef NOT_USED
int
smgrabort()
{
@ -396,8 +466,6 @@ smgrabort()
return SM_SUCCESS;
}
#endif
#ifdef NOT_USED
bool
smgriswo(int16 smgrno)

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: catalog.h,v 1.10 2000/01/26 05:57:56 momjian Exp $
* $Id: catalog.h,v 1.11 2000/04/09 04:43:14 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -17,6 +17,8 @@
#include "access/tupdesc.h"
extern char *relpath(const char *relname);
extern char *relpath_blind(const char *dbname, const char *relname,
Oid dbid, Oid relid);
extern bool IsSystemRelationName(const char *relname);
extern bool IsSharedSystemRelationName(const char *relname);
extern Oid newoid(void);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: buf_internals.h,v 1.35 2000/01/26 05:58:32 momjian Exp $
* $Id: buf_internals.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -61,6 +61,16 @@ typedef struct buftag
(a)->relId = (xx_reln)->rd_lockInfo.lockRelId \
)
/* If we have to write a buffer "blind" (without a relcache entry),
* the BufferTag is not enough information. BufferBlindId carries the
* additional information needed.
*/
typedef struct bufblindid
{
char dbname[NAMEDATALEN]; /* name of db in which buf belongs */
char relname[NAMEDATALEN]; /* name of reln */
} BufferBlindId;
#define BAD_BUFFER_ID(bid) ((bid) < 1 || (bid) > NBuffers)
#define INVALID_DESCRIPTOR (-3)
@ -98,8 +108,7 @@ typedef struct sbufdesc
bool ri_lock; /* read-intent lock */
bool w_lock; /* context exclusively locked */
char sb_dbname[NAMEDATALEN]; /* name of db in which buf belongs */
char sb_relname[NAMEDATALEN]; /* name of reln */
BufferBlindId blind; /* extra info to support blind write */
} BufferDesc;
/*
@ -164,7 +173,9 @@ extern BufferDesc *BufferDescriptors;
extern BufferBlock BufferBlocks;
extern long *PrivateRefCount;
extern bits8 *BufferLocks;
extern long *CommitInfoNeedsSave;
extern BufferTag *BufferTagLastDirtied;
extern BufferBlindId *BufferBlindLastDirtied;
extern bool *BufferDirtiedByMe;
extern SPINLOCK BufMgrLock;
/* localbuf.c */

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: bufmgr.h,v 1.35 2000/03/31 02:43:30 tgl Exp $
* $Id: bufmgr.h,v 1.36 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -164,7 +164,7 @@ extern int FlushBuffer(Buffer buffer, bool release);
extern void InitBufferPool(IPCKey key);
extern void PrintBufferUsage(FILE *statfp);
extern void ResetBufferUsage(void);
extern void ResetBufferPool(void);
extern void ResetBufferPool(bool isCommit);
extern int BufferPoolCheckLeak(void);
extern void FlushBufferPool(void);
extern BlockNumber BufferGetBlockNumber(Buffer buffer);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: fd.h,v 1.19 2000/01/26 05:58:32 momjian Exp $
* $Id: fd.h,v 1.20 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -15,7 +15,7 @@
/*
* calls:
*
* File {Close, Read, Write, Seek, Tell, Sync}
* File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
@ -58,6 +58,7 @@ extern int FileWrite(File file, char *buffer, int amount);
extern long FileSeek(File file, long offset, int whence);
extern int FileTruncate(File file, long offset);
extern int FileSync(File file);
extern void FileMarkDirty(File file);
/* Operations that allow use of regular stdio --- USE WITH CAUTION */
extern FILE *AllocateFile(char *name, char *mode);

View File

@ -7,7 +7,7 @@
* Portions Copyright (c) 1996-2000, PostgreSQL, Inc
* Portions Copyright (c) 1994, Regents of the University of California
*
* $Id: smgr.h,v 1.17 2000/01/26 05:58:33 momjian Exp $
* $Id: smgr.h,v 1.18 2000/04/09 04:43:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -29,17 +29,23 @@ extern int smgrunlink(int16 which, Relation reln);
extern int smgrextend(int16 which, Relation reln, char *buffer);
extern int smgropen(int16 which, Relation reln);
extern int smgrclose(int16 which, Relation reln);
extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid,
Oid relid, BlockNumber blkno, char *buffer);
extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrblindwrt(int16 which, char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int smgrmarkdirty(int16 which, Relation reln, BlockNumber blkno);
extern int smgrblindmarkdirty(int16 which, char *dbname, char *relname,
Oid dbid, Oid relid,
BlockNumber blkno);
extern int smgrnblocks(int16 which, Relation reln);
extern int smgrtruncate(int16 which, Relation reln, int nblocks);
extern int smgrcommit(void);
extern int smgrabort(void);
@ -55,8 +61,11 @@ extern int mdclose(Relation reln);
extern int mdread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mdblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mdmarkdirty(Relation reln, BlockNumber blkno);
extern int mdblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno);
extern int mdnblocks(Relation reln);
extern int mdtruncate(Relation reln, int nblocks);
extern int mdcommit(void);
@ -66,7 +75,6 @@ extern int mdabort(void);
extern SPINLOCK MMCacheLock;
extern int mminit(void);
extern int mmshutdown(void);
extern int mmcreate(Relation reln);
extern int mmunlink(Relation reln);
extern int mmextend(Relation reln, char *buffer);
@ -75,11 +83,17 @@ extern int mmclose(Relation reln);
extern int mmread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mmblindwrt(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mmmarkdirty(Relation reln, BlockNumber blkno);
extern int mmblindmarkdirty(char *dbname, char *relname, Oid dbid, Oid relid,
BlockNumber blkno);
extern int mmnblocks(Relation reln);
extern int mmtruncate(Relation reln, int nblocks);
extern int mmcommit(void);
extern int mmabort(void);
extern int mmshutdown(void);
extern int MMShmemSize(void);
/* smgrtype.c */