@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $ Header : / cvsroot / pgsql / src / backend / storage / buffer / bufmgr . c , v 1.77 2000 / 03 / 31 02 : 43 : 3 1 tgl Exp $
* $ Header : / cvsroot / pgsql / src / backend / storage / buffer / bufmgr . c , v 1.78 2000 / 04 / 09 04 : 43 : 18 tgl Exp $
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
@ -94,8 +94,10 @@ static Buffer ReadBufferWithBufferLock(Relation relation, BlockNumber blockNum,
bool bufferLockHeld ) ;
static BufferDesc * BufferAlloc ( Relation reln , BlockNumber blockNum ,
bool * foundPtr , bool bufferLockHeld ) ;
static void SetBufferDirtiedByMe ( Buffer buffer , BufferDesc * bufHdr ) ;
static void ClearBufferDirtiedByMe ( Buffer buffer , BufferDesc * bufHdr ) ;
static void BufferSync ( void ) ;
static int BufferReplace ( BufferDesc * bufHdr , bool bufferLockHeld ) ;
static int BufferReplace ( BufferDesc * bufHdr ) ;
void PrintBufferDescs ( void ) ;
/* ---------------------------------------------------
@ -176,7 +178,7 @@ is_userbuffer(Buffer buffer)
{
BufferDesc * buf = & BufferDescriptors [ buffer - 1 ] ;
if ( IsSystemRelationName ( buf - > sb_ relname) )
if ( IsSystemRelationName ( buf - > blind . relname ) )
return false ;
return true ;
}
@ -199,7 +201,7 @@ ReadBuffer_Debug(char *file,
fprintf ( stderr , " PIN(RD) %ld relname = %s, blockNum = %d, \
refcount = % ld , file : % s , line : % d \ n " ,
buffer , buf - > sb_ relname, buf - > tag . blockNum ,
buffer , buf - > blind . relname , buf - > tag . blockNum ,
PrivateRefCount [ buffer - 1 ] , file , line ) ;
}
return buffer ;
@ -390,7 +392,6 @@ BufferAlloc(Relation reln,
* If there ' s no IO for the buffer and the buffer
* is BROKEN , it should be read again . So start a
* new buffer IO here .
*
* wierd race condition :
*
@ -465,33 +466,24 @@ BufferAlloc(Relation reln,
* in WaitIO until we ' re done .
*/
inProgress = TRUE ;
# ifdef HAS_TEST_AND_SET
/*
* All code paths that acquire this lock pin the buffer first ;
* since no one had it pinned ( it just came off the free
* list ) , no one else can have this lock .
*/
# endif /* HAS_TEST_AND_SET */
StartBufferIO ( buf , false ) ;
/*
* Write the buffer out , being careful to release BufMgrLock
* before starting the I / O .
*
* This # ifndef is here because a few extra semops REALLY kill
* you on machines that don ' t have spinlocks . If you don ' t
* operate with much concurrency , well . . .
*/
smok = BufferReplace ( buf , true ) ;
# ifndef OPTIMIZE_SINGLE
SpinAcquire ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
smok = BufferReplace ( buf ) ;
if ( smok = = FALSE )
{
elog ( NOTICE , " BufferAlloc: cannot write block %u for %s/%s " ,
buf - > tag . blockNum , buf - > sb_ dbname, buf - > sb_ relname) ;
buf - > tag . blockNum , buf - > blind . dbname , buf - > blind . relname ) ;
inProgress = FALSE ;
buf - > flags | = BM_IO_ERROR ;
buf - > flags & = ~ BM_IO_IN_PROGRESS ;
@ -516,7 +508,7 @@ BufferAlloc(Relation reln,
if ( buf - > flags & BM_JUST_DIRTIED )
{
elog ( FATAL , " BufferAlloc: content of block %u (%s) changed while flushing " ,
buf - > tag . blockNum , buf - > sb_ relname) ;
buf - > tag . blockNum , buf - > blind . relname ) ;
}
else
buf - > flags & = ~ BM_DIRTY ;
@ -562,6 +554,7 @@ BufferAlloc(Relation reln,
*/
if ( buf ! = NULL )
{
buf - > flags & = ~ BM_IO_IN_PROGRESS ;
TerminateBufferIO ( buf ) ;
/* give up the buffer since we don't need it any more */
PrivateRefCount [ BufferDescriptorGetBuffer ( buf ) - 1 ] = 0 ;
@ -572,7 +565,6 @@ BufferAlloc(Relation reln,
AddBufferToFreelist ( buf ) ;
buf - > flags | = BM_FREE ;
}
buf - > flags & = ~ BM_IO_IN_PROGRESS ;
}
PinBuffer ( buf2 ) ;
@ -619,8 +611,8 @@ BufferAlloc(Relation reln,
}
/* record the database name and relation name for this buffer */
strcpy ( buf - > sb_relname , RelationGetPhysicalRelationName ( reln ) ) ;
strcpy ( buf - > sb_dbname , DatabaseName ) ;
strcpy ( buf - > blind . dbname , DatabaseName ) ;
strcpy ( buf - > blind . relname , RelationGetPhysicalRelationName ( reln ) ) ;
INIT_BUFFERTAG ( & ( buf - > tag ) , reln , blockNum ) ;
if ( ! BufTableInsert ( buf ) )
@ -683,9 +675,9 @@ WriteBuffer(Buffer buffer)
SpinAcquire ( BufMgrLock ) ;
Assert ( bufHdr - > refcount > 0 ) ;
bufHdr - > flags | = ( BM_DIRTY | BM_JUST_DIRTIED ) ;
SetBufferDirtiedByMe ( buffer , bufHdr ) ;
UnpinBuffer ( bufHdr ) ;
SpinRelease ( BufMgrLock ) ;
CommitInfoNeedsSave [ buffer - 1 ] = 0 ;
return TRUE ;
}
@ -702,7 +694,7 @@ WriteBuffer_Debug(char *file, int line, Buffer buffer)
buf = & BufferDescriptors [ buffer - 1 ] ;
fprintf ( stderr , " UNPIN(WR) %ld relname = %s, blockNum = %d, \
refcount = % ld , file : % s , line : % d \ n " ,
buffer , buf - > sb_ relname, buf - > tag . blockNum ,
buffer , buf - > blind . relname , buf - > tag . blockNum ,
PrivateRefCount [ buffer - 1 ] , file , line ) ;
}
}
@ -767,8 +759,9 @@ DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno, char *dest)
*
* ' buffer ' is known to be dirty / pinned , so there should not be a
* problem reading the BufferDesc members without the BufMgrLock
* ( nobody should be able to change tags , flags , etc . out from under
* us ) . Unpin if ' release ' is TRUE .
* ( nobody should be able to change tags out from under us ) .
*
* Unpin if ' release ' is TRUE .
*/
int
FlushBuffer ( Buffer buffer , bool release )
@ -784,6 +777,8 @@ FlushBuffer(Buffer buffer, bool release)
if ( BAD_BUFFER_ID ( buffer ) )
return STATUS_ERROR ;
Assert ( PrivateRefCount [ buffer - 1 ] > 0 ) ; /* else caller didn't pin */
bufHdr = & BufferDescriptors [ buffer - 1 ] ;
bufdb = bufHdr - > tag . relId . dbId ;
@ -809,7 +804,7 @@ FlushBuffer(Buffer buffer, bool release)
if ( status = = SM_FAIL )
{
elog ( ERROR , " FlushBuffer: cannot flush block %u of the relation %s " ,
bufHdr - > tag . blockNum , bufHdr - > sb_ relname) ;
bufHdr - > tag . blockNum , bufHdr - > blind . relname ) ;
return STATUS_ERROR ;
}
BufferFlushCount + + ;
@ -820,19 +815,21 @@ FlushBuffer(Buffer buffer, bool release)
/*
* If this buffer was marked by someone as DIRTY while we were
* flushing it out we must not clear DIRTY flag - vadim 01 / 17 / 97
* flushing it out we must not clear shared DIRTY flag - vadim 01 / 17 / 97
*
* . . . but we can clear BufferDirtiedByMe anyway - tgl 3 / 31 / 00
*/
if ( bufHdr - > flags & BM_JUST_DIRTIED )
{
elog ( NOTICE , " FlushBuffer: content of block %u (%s) changed while flushing " ,
bufHdr - > tag . blockNum , bufHdr - > sb_ relname) ;
bufHdr - > tag . blockNum , bufHdr - > blind . relname ) ;
}
else
bufHdr - > flags & = ~ BM_DIRTY ;
ClearBufferDirtiedByMe ( buffer , bufHdr ) ;
if ( release )
UnpinBuffer ( bufHdr ) ;
SpinRelease ( BufMgrLock ) ;
CommitInfoNeedsSave [ buffer - 1 ] = 0 ;
return STATUS_OK ;
}
@ -857,9 +854,10 @@ WriteNoReleaseBuffer(Buffer buffer)
SharedBufferChanged = true ;
SpinAcquire ( BufMgrLock ) ;
Assert ( bufHdr - > refcount > 0 ) ;
bufHdr - > flags | = ( BM_DIRTY | BM_JUST_DIRTIED ) ;
SetBufferDirtiedByMe ( buffer , bufHdr ) ;
SpinRelease ( BufMgrLock ) ;
CommitInfoNeedsSave [ buffer - 1 ] = 0 ;
return STATUS_OK ;
}
@ -901,11 +899,6 @@ ReleaseAndReadBuffer(Buffer buffer,
AddBufferToFreelist ( bufHdr ) ;
bufHdr - > flags | = BM_FREE ;
}
if ( CommitInfoNeedsSave [ buffer - 1 ] )
{
bufHdr - > flags | = ( BM_DIRTY | BM_JUST_DIRTIED ) ;
CommitInfoNeedsSave [ buffer - 1 ] = 0 ;
}
retbuf = ReadBufferWithBufferLock ( relation , blockNum , true ) ;
return retbuf ;
}
@ -915,13 +908,120 @@ ReleaseAndReadBuffer(Buffer buffer,
return ReadBuffer ( relation , blockNum ) ;
}
/*
* SetBufferDirtiedByMe - - mark a shared buffer as being dirtied by this xact
*
* This flag essentially remembers that we need to write and fsync this buffer
* before we can commit the transaction . The write might end up getting done
* by another backend , but we must do the fsync ourselves ( else we could
* commit before the data actually reaches disk ) . We do not issue fsync
* instantly upon write ; the storage manager keeps track of which files need
* to be fsync ' d before commit can occur . A key aspect of this data structure
* is that we will be able to notify the storage manager that an fsync is
* needed even after another backend has done the physical write and replaced
* the buffer contents with something else !
*
* NB : we must be holding the bufmgr lock at entry , and the buffer must be
* pinned so that no other backend can take it away from us .
*/
static void
SetBufferDirtiedByMe ( Buffer buffer , BufferDesc * bufHdr )
{
BufferTag * tagLastDirtied = & BufferTagLastDirtied [ buffer - 1 ] ;
Relation reln ;
int status ;
/*
* If the flag is already set , check to see whether the buffertag is
* the same . If not , some other backend already wrote the buffer data
* that we dirtied . We must tell the storage manager to make an fsync
* pending on that file before we can overwrite the old tag value .
*/
if ( BufferDirtiedByMe [ buffer - 1 ] )
{
if ( bufHdr - > tag . relId . dbId = = tagLastDirtied - > relId . dbId & &
bufHdr - > tag . relId . relId = = tagLastDirtied - > relId . relId & &
bufHdr - > tag . blockNum = = tagLastDirtied - > blockNum )
return ; /* Same tag already dirtied, so no work */
# ifndef OPTIMIZE_SINGLE
SpinRelease ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
reln = RelationIdCacheGetRelation ( tagLastDirtied - > relId . relId ) ;
if ( reln = = ( Relation ) NULL )
{
status = smgrblindmarkdirty ( DEFAULT_SMGR ,
BufferBlindLastDirtied [ buffer - 1 ] . dbname ,
BufferBlindLastDirtied [ buffer - 1 ] . relname ,
tagLastDirtied - > relId . dbId ,
tagLastDirtied - > relId . relId ,
tagLastDirtied - > blockNum ) ;
}
else
{
status = smgrmarkdirty ( DEFAULT_SMGR , reln ,
tagLastDirtied - > blockNum ) ;
/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
RelationDecrementReferenceCount ( reln ) ;
}
if ( status = = SM_FAIL )
{
elog ( ERROR , " SetBufferDirtiedByMe: cannot mark %u for %s " ,
tagLastDirtied - > blockNum ,
BufferBlindLastDirtied [ buffer - 1 ] . relname ) ;
}
# ifndef OPTIMIZE_SINGLE
SpinAcquire ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
}
* tagLastDirtied = bufHdr - > tag ;
BufferBlindLastDirtied [ buffer - 1 ] = bufHdr - > blind ;
BufferDirtiedByMe [ buffer - 1 ] = true ;
}
/*
* ClearBufferDirtiedByMe - - mark a shared buffer as no longer needing fsync
*
* If we write out a buffer ourselves , then the storage manager will set its
* needs - fsync flag for that file automatically , and so we can clear our own
* flag that says it needs to be done later .
*
* NB : we must be holding the bufmgr lock at entry .
*/
static void
ClearBufferDirtiedByMe ( Buffer buffer , BufferDesc * bufHdr )
{
BufferTag * tagLastDirtied = & BufferTagLastDirtied [ buffer - 1 ] ;
/*
* Do * not * clear the flag if it refers to some other buffertag than
* the data we just wrote . This is unlikely , but possible if some
* other backend replaced the buffer contents since we set our flag .
*/
if ( bufHdr - > tag . relId . dbId = = tagLastDirtied - > relId . dbId & &
bufHdr - > tag . relId . relId = = tagLastDirtied - > relId . relId & &
bufHdr - > tag . blockNum = = tagLastDirtied - > blockNum )
{
BufferDirtiedByMe [ buffer - 1 ] = false ;
}
}
/*
* BufferSync - - Flush all dirty buffers in the pool .
*
* This is called at transaction commit time . It does the wrong thing ,
* right now . We should flush only our own changes to stable storage ,
* and we should obey the lock protocol on the buffer manager metadata
* as we do it . Also , we need to be sure that no other transaction is
* This is called at transaction commit time . We find all buffers
* that have been dirtied by the current xact and flush them to disk .
* We do * not * flush dirty buffers that have been dirtied by other xacts .
* ( This is a substantial change from pre - 7.0 behavior . )
*
* OLD COMMENTS ( do these still apply ? )
*
* Also , we need to be sure that no other transaction is
* modifying the page as we flush it . This is only a problem for objects
* that use a non - two - phase locking protocol , like btree indices . For
* those objects , we would like to set a write lock for the duration of
@ -936,21 +1036,49 @@ static void
BufferSync ( )
{
int i ;
Oid bufdb ;
Oid bufrel ;
Relation reln ;
BufferDesc * bufHdr ;
int status ;
Relation reln ;
bool didwrite ;
SpinAcquire ( BufMgrLock ) ;
for ( i = 0 , bufHdr = BufferDescriptors ; i < NBuffers ; i + + , bufHdr + + )
{
/* Ignore buffers that were not dirtied by me */
if ( ! BufferDirtiedByMe [ i ] )
continue ;
SpinAcquire ( BufMgrLock ) ;
/*
* We only need to write if the buffer is still dirty and still
* contains the same disk page that it contained when we dirtied it .
* Otherwise , someone else has already written our changes for us ,
* and we need only fsync .
*
* ( NOTE : it ' s still possible to do an unnecessary write , if other
* xacts have written and then re - dirtied the page since our last
* change to it . But that should be pretty uncommon , and there ' s
* no easy way to detect it anyway . )
*/
reln = NULL ;
didwrite = false ;
if ( ( bufHdr - > flags & BM_VALID ) & & ( bufHdr - > flags & BM_DIRTY ) )
{
Oid bufdb ;
Oid bufrel ;
bufdb = bufHdr - > tag . relId . dbId ;
bufrel = bufHdr - > tag . relId . relId ;
if ( bufdb = = MyDatabaseId | | bufdb = = ( Oid ) 0 )
if ( bufdb = = BufferTagLastDirtied [ i ] . relId . dbId & &
bufrel = = BufferTagLastDirtied [ i ] . relId . relId & &
bufHdr - > tag . blockNum = = BufferTagLastDirtied [ i ] . blockNum )
{
/*
* Try to find relation for buf . This could fail , if the
* rel has been flushed from the relcache since we dirtied
* the page . That should be uncommon , so paying the extra
* cost of a blind write when it happens seems OK .
*/
reln = RelationIdCacheGetRelation ( bufrel ) ;
/*
@ -970,14 +1098,11 @@ BufferSync()
if ( bufHdr - > flags & BM_IO_ERROR )
{
elog ( ERROR , " BufferSync: write error %u for %s " ,
bufHdr - > tag . blockNum , bufHdr - > sb_ relname) ;
bufHdr - > tag . blockNum , bufHdr - > blind . relname ) ;
}
/* drop refcnt from RelationIdCacheGetRelation */
if ( reln ! = ( Relation ) NULL )
RelationDecrementReferenceCount ( reln ) ;
continue ;
}
else
{
/*
* To check if block content changed while flushing ( see
* below ) . - vadim 01 / 17 / 97
@ -987,19 +1112,20 @@ BufferSync()
StartBufferIO ( bufHdr , false ) ; /* output IO start */
/*
* If we didn ' t have the reldesc in our local cache , flush
* If we didn ' t have the reldesc in our local cache , write
* this page out using the ' blind write ' storage manager
* routine . If we did find it , use the standard
* interface .
*/
# ifndef OPTIMIZE_SINGLE
SpinRelease ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
if ( reln = = ( Relation ) NULL )
{
status = smgrblindwrt ( DEFAULT_SMGR , bufHdr - > sb_dbname ,
bufHdr - > sb_relname , bufdb , bufrel ,
status = smgrblindwrt ( DEFAULT_SMGR ,
bufHdr - > blind . dbname ,
bufHdr - > blind . relname ,
bufdb , bufrel ,
bufHdr - > tag . blockNum ,
( char * ) MAKE_PTR ( bufHdr - > data ) ) ;
}
@ -1018,26 +1144,68 @@ BufferSync()
{
bufHdr - > flags | = BM_IO_ERROR ;
elog ( ERROR , " BufferSync: cannot write %u for %s " ,
bufHdr - > tag . blockNum , bufHdr - > sb_ relname) ;
bufHdr - > tag . blockNum , bufHdr - > blind . relname ) ;
}
bufHdr - > flags & = ~ BM_IO_IN_PROGRESS ; /* mark IO finished */
TerminateBufferIO ( bufHdr ) ; /* Sync IO finished */
BufferFlushCount + + ;
didwrite = true ;
/*
* If this buffer was marked by someone as DIRTY while we
* were flushing it out we must not clear DIRTY flag -
* vadim 01 / 17 / 97
*
* but it is OK to clear BufferDirtiedByMe - tgl 3 / 31 / 00
*/
if ( ! ( bufHdr - > flags & BM_JUST_DIRTIED ) )
bufHdr - > flags & = ~ BM_DIRTY ;
/* drop refcnt from RelationIdCacheGetRelation */
}
/* drop refcnt obtained by RelationIdCacheGetRelation */
if ( reln ! = ( Relation ) NULL )
RelationDecrementReferenceCount ( reln ) ;
}
}
/*
* If we did not write the buffer ( because someone else did ) ,
* we must still fsync the file containing it , to ensure that the
* write is down to disk before we commit .
*/
if ( ! didwrite )
{
# ifndef OPTIMIZE_SINGLE
SpinRelease ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
reln = RelationIdCacheGetRelation ( BufferTagLastDirtied [ i ] . relId . relId ) ;
if ( reln = = ( Relation ) NULL )
{
status = smgrblindmarkdirty ( DEFAULT_SMGR ,
BufferBlindLastDirtied [ i ] . dbname ,
BufferBlindLastDirtied [ i ] . relname ,
BufferTagLastDirtied [ i ] . relId . dbId ,
BufferTagLastDirtied [ i ] . relId . relId ,
BufferTagLastDirtied [ i ] . blockNum ) ;
}
else
{
status = smgrmarkdirty ( DEFAULT_SMGR , reln ,
BufferTagLastDirtied [ i ] . blockNum ) ;
/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
RelationDecrementReferenceCount ( reln ) ;
}
# ifndef OPTIMIZE_SINGLE
SpinAcquire ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
}
BufferDirtiedByMe [ i ] = false ;
SpinRelease ( BufMgrLock ) ;
}
LocalBufferSync ( ) ;
}
@ -1166,13 +1334,19 @@ ResetBufferUsage()
/* ----------------------------------------------
* ResetBufferPool
*
* t his routine is supposed to be called when a transaction aborts .
* T his routine is supposed to be called when a transaction aborts .
* it will release all the buffer pins held by the transaction .
* Currently , we also call it during commit if BufferPoolCheckLeak
* detected a problem - - - in that case , isCommit is TRUE , and we
* only clean up buffer pin counts .
*
* During abort , we also forget any pending fsync requests . Dirtied buffers
* will still get written , eventually , but there will be no fsync for them .
*
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
void
ResetBufferPool ( )
ResetBufferPool ( bool isCommit )
{
int i ;
@ -1193,10 +1367,15 @@ ResetBufferPool()
SpinRelease ( BufMgrLock ) ;
}
PrivateRefCount [ i ] = 0 ;
CommitInfoNeedsSave [ i ] = 0 ;
if ( ! isCommit )
BufferDirtiedByMe [ i ] = false ;
}
ResetLocalBufferPool ( ) ;
if ( ! isCommit )
smgrabort ( ) ;
}
/* -----------------------------------------------
@ -1222,7 +1401,7 @@ BufferPoolCheckLeak()
" Buffer Leak: [%03d] (freeNext=%ld, freePrev=%ld, \
relname = % s , blockNum = % d , flags = 0 x % x , refcount = % d % ld ) " ,
i - 1 , buf - > freeNext , buf - > freePrev ,
buf - > sb_ relname, buf - > tag . blockNum , buf - > flags ,
buf - > blind . relname , buf - > tag . blockNum , buf - > flags ,
buf - > refcount , PrivateRefCount [ i - 1 ] ) ;
result = 1 ;
}
@ -1306,25 +1485,25 @@ BufferGetRelation(Buffer buffer)
/*
* BufferReplace
*
* Flush the buffer corresponding to ' bufHdr '
* Write out the buffer corresponding to ' bufHdr '
*
* This routine used to flush the data to disk ( ie , force immediate fsync )
* but that ' s no longer necessary because BufferSync is smarter than before .
*
* BufMgrLock must be held at entry , and the buffer must be pinned .
*/
static int
BufferReplace ( BufferDesc * bufHdr , bool bufferLockHeld )
BufferReplace ( BufferDesc * bufHdr )
{
Relation reln ;
Oid bufdb ,
bufrel ;
int status ;
if ( ! bufferLockHeld )
SpinAcquire ( BufMgrLock ) ;
/*
* first try to find the reldesc in the cache , if no luck , don ' t
* bother to build the reldesc from scratch , just do a blind write .
*/
bufdb = bufHdr - > tag . relId . dbId ;
bufrel = bufHdr - > tag . relId . relId ;
@ -1336,22 +1515,27 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
/* To check if block content changed while flushing. - vadim 01/17/97 */
bufHdr - > flags & = ~ BM_JUST_DIRTIED ;
# ifndef OPTIMIZE_SINGLE
SpinRelease ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
if ( reln ! = ( Relation ) NULL )
{
status = smgrflush ( DEFAULT_SMGR , reln , bufHdr - > tag . blockNum ,
status = smgrwrite ( DEFAULT_SMGR , reln , bufHdr - > tag . blockNum ,
( char * ) MAKE_PTR ( bufHdr - > data ) ) ;
}
else
{
/* blind write always flushes */
status = smgrblindwrt ( DEFAULT_SMGR , bufHdr - > sb_dbname ,
bufHdr - > sb_relname , bufdb , bufrel ,
status = smgrblindwrt ( DEFAULT_SMGR , bufHdr - > blind . dbname ,
bufHdr - > blind . relname , bufdb , bufrel ,
bufHdr - > tag . blockNum ,
( char * ) MAKE_PTR ( bufHdr - > data ) ) ;
}
# ifndef OPTIMIZE_SINGLE
SpinAcquire ( BufMgrLock ) ;
# endif /* OPTIMIZE_SINGLE */
/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
if ( reln ! = ( Relation ) NULL )
RelationDecrementReferenceCount ( reln ) ;
@ -1359,6 +1543,11 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
if ( status = = SM_FAIL )
return FALSE ;
/* If we had marked this buffer as needing to be fsync'd, we can forget
* about that , because it ' s now the storage manager ' s responsibility .
*/
ClearBufferDirtiedByMe ( BufferDescriptorGetBuffer ( bufHdr ) , bufHdr ) ;
BufferFlushCount + + ;
return TRUE ;
@ -1440,7 +1629,7 @@ ReleaseRelationBuffers(Relation rel)
}
/* Now we can do what we came for */
buf - > flags & = ~ ( BM_DIRTY | BM_JUST_DIRTIED ) ;
CommitInfoNeedsSave [ i - 1 ] = 0 ;
ClearBufferDirtiedByMe ( i , buf ) ;
/*
* Release any refcount we may have .
*
@ -1502,6 +1691,7 @@ DropBuffers(Oid dbid)
}
/* Now we can do what we came for */
buf - > flags & = ~ ( BM_DIRTY | BM_JUST_DIRTIED ) ;
ClearBufferDirtiedByMe ( i , buf ) ;
/*
* The thing should be free , if caller has checked that
* no backends are running in that database .
@ -1533,7 +1723,7 @@ PrintBufferDescs()
elog ( DEBUG , " [%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
blockNum = % d , flags = 0 x % x , refcount = % d % ld ) " ,
i , buf - > freeNext , buf - > freePrev ,
buf - > sb_ relname, buf - > tag . blockNum , buf - > flags ,
buf - > blind . relname , buf - > tag . blockNum , buf - > flags ,
buf - > refcount , PrivateRefCount [ i ] ) ;
}
SpinRelease ( BufMgrLock ) ;
@ -1544,7 +1734,7 @@ blockNum=%d, flags=0x%x, refcount=%d %ld)",
for ( i = 0 ; i < NBuffers ; + + i , + + buf )
{
printf ( " [%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld) \n " ,
i , buf - > sb_ relname, buf - > tag . blockNum ,
i , buf - > blind . relname , buf - > tag . blockNum ,
buf - > flags , buf - > refcount , PrivateRefCount [ i ] ) ;
}
}
@ -1562,7 +1752,7 @@ PrintPinnedBufs()
if ( PrivateRefCount [ i ] > 0 )
elog ( NOTICE , " [%02d] (freeNext=%ld, freePrev=%ld, relname=%s, \
blockNum = % d , flags = 0 x % x , refcount = % d % ld ) \ n " ,
i , buf - > freeNext , buf - > freePrev , buf - > sb_ relname,
i , buf - > freeNext , buf - > freePrev , buf - > blind . relname ,
buf - > tag . blockNum , buf - > flags ,
buf - > refcount , PrivateRefCount [ i ] ) ;
}
@ -1601,33 +1791,42 @@ BufferPoolBlowaway()
* FlushRelationBuffers
*
* This function removes from the buffer pool all pages of a relation
* that have blocknumber > = specified block . If doFlush is true ,
* dirty buffers are written out - - - otherwise it ' s an error for any
* of the buffers to be dirty .
* that have blocknumber > = specified block . Pages that are dirty are
* written out first . If expectDirty is false , a notice is emitted
* warning of dirty buffers , but we proceed anyway . An error code is
* returned if we fail to dump a dirty buffer or if we find one of
* the target pages is pinned into the cache .
*
* This is used by VACUUM before truncating the relation to the given
* number of blocks . For VACUUM , we pass doFlush = false since it would
* mean a bug in VACUUM if any of the unwanted pages were still dirty .
* ( TRUNCATE TABLE also uses it in the same way . )
* number of blocks . For VACUUM , we pass expectDirty = false since it
* could mean a bug in VACUUM if any of the unwanted pages were still
* dirty . ( TRUNCATE TABLE also uses it in the same way . )
*
* This is also used by RENAME TABLE ( with block = 0 and doFlush = true )
* This is also used by RENAME TABLE ( with block = 0 and expectDirty = true )
* to clear out the buffer cache before renaming the physical files of
* a relation . Without that , some other backend might try to do a
* blind write of a buffer page ( relying on the sb_relname of the buffer )
* blind write of a buffer page ( relying on the BlindId of the buffer )
* and fail because it ' s not got the right filename anymore .
*
* In both cases , the caller should be holding AccessExclusiveLock on
* the target relation to ensure that no other backend is busy reading
* more blocks of the relation . . .
* more blocks of the relation .
*
* Returns : 0 - Ok , - 1 - DIRTY , - 2 - PINNED
* Formerly , we considered it an error condition if we found unexpectedly
* dirty buffers . However , since BufferSync no longer forces out all
* dirty buffers at every xact commit , it ' s possible for dirty buffers
* to still be present in the cache due to failure of an earlier
* transaction . So , downgrade the error to a mere notice . Maybe we
* shouldn ' t even emit a notice . . .
*
* Returns : 0 - Ok , - 1 - FAILED TO WRITE DIRTY BUFFER , - 2 - PINNED
*
* XXX currently it sequentially searches the buffer pool , should be
* changed to more clever ways of searching .
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
int
FlushRelationBuffers ( Relation rel , BlockNumber block , bool doFlush )
FlushRelationBuffers ( Relation rel , BlockNumber block , bool expectDirty )
{
int i ;
BufferDesc * buf ;
@ -1642,19 +1841,13 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
{
if ( buf - > flags & BM_DIRTY )
{
if ( doFlush )
{
if ( FlushBuffer ( - i - 1 , false ) ! = STATUS_OK )
{
elog ( NOTICE , " FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it " ,
if ( ! expectDirty )
elog ( NOTICE , " FlushRelationBuffers(%s (local), %u): block %u is dirty " ,
RelationGetRelationName ( rel ) ,
block , buf - > tag . blockNum ) ;
return - 1 ;
}
}
else
if ( FlushBuffer ( - i - 1 , false ) ! = STATUS_OK )
{
elog ( NOTICE , " FlushRelationBuffers(%s (local), %u): block %u is dirty " ,
elog ( NOTICE , " FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it " ,
RelationGetRelationName ( rel ) ,
block , buf - > tag . blockNum ) ;
return - 1 ;
@ -1676,39 +1869,42 @@ FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
SpinAcquire ( BufMgrLock ) ;
for ( i = 0 ; i < NBuffers ; i + + )
{
recheck :
buf = & BufferDescriptors [ i ] ;
if ( buf - > tag . relId . dbId = = MyDatabaseId & &
buf - > tag . relId . relId = = RelationGetRelid ( rel ) & &
if ( buf - > tag . relId . relId = = RelationGetRelid ( rel ) & &
( buf - > tag . relId . dbId = = MyDatabaseId | |
buf - > tag . relId . dbId = = ( Oid ) NULL ) & &
buf - > tag . blockNum > = block )
{
if ( buf - > flags & BM_DIRTY )
{
if ( doFlush )
{
PinBuffer ( buf ) ;
SpinRelease ( BufMgrLock ) ;
if ( FlushBuffer ( i + 1 , false ) ! = STATUS_OK )
if ( ! expectDirty )
elog ( NOTICE , " FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d) " ,
RelationGetRelationName ( rel ) , block ,
buf - > tag . blockNum ,
PrivateRefCount [ i ] , buf - > refcount ) ;
if ( FlushBuffer ( i + 1 , true ) ! = STATUS_OK )
{
elog ( NOTICE , " FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d), could not flush it " ,
buf - > sb_relname , block , buf - > tag . blockNum ,
RelationGetRelationName ( rel ) , block ,
buf - > tag . blockNum ,
PrivateRefCount [ i ] , buf - > refcount ) ;
return - 1 ;
}
SpinAcquire ( BufMgrLock ) ;
}
else
{
SpinRelease ( BufMgrLock ) ;
elog ( NOTICE , " FlushRelationBuffers(%s, %u): block %u is dirty (private %ld, global %d) " ,
buf - > sb_relname , block , buf - > tag . blockNum ,
PrivateRefCount [ i ] , buf - > refcount ) ;
return - 1 ;
}
/* Buffer could already be reassigned, so must recheck
* whether it still belongs to rel before freeing it !
*/
goto recheck ;
}
if ( ! ( buf - > flags & BM_FREE ) )
{
SpinRelease ( BufMgrLock ) ;
elog ( NOTICE , " FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d) " ,
buf - > sb_relname , block , buf - > tag . blockNum ,
RelationGetRelationName ( rel ) , block ,
buf - > tag . blockNum ,
PrivateRefCount [ i ] , buf - > refcount ) ;
return - 2 ;
}
@ -1755,11 +1951,6 @@ ReleaseBuffer(Buffer buffer)
AddBufferToFreelist ( bufHdr ) ;
bufHdr - > flags | = BM_FREE ;
}
if ( CommitInfoNeedsSave [ buffer - 1 ] )
{
bufHdr - > flags | = ( BM_DIRTY | BM_JUST_DIRTIED ) ;
CommitInfoNeedsSave [ buffer - 1 ] = 0 ;
}
SpinRelease ( BufMgrLock ) ;
}
@ -1777,7 +1968,7 @@ IncrBufferRefCount_Debug(char *file, int line, Buffer buffer)
fprintf ( stderr , " PIN(Incr) %ld relname = %s, blockNum = %d, \
refcount = % ld , file : % s , line : % d \ n " ,
buffer , buf - > sb_ relname, buf - > tag . blockNum ,
buffer , buf - > blind . relname , buf - > tag . blockNum ,
PrivateRefCount [ buffer - 1 ] , file , line ) ;
}
}
@ -1795,7 +1986,7 @@ ReleaseBuffer_Debug(char *file, int line, Buffer buffer)
fprintf ( stderr , " UNPIN(Rel) %ld relname = %s, blockNum = %d, \
refcount = % ld , file : % s , line : % d \ n " ,
buffer , buf - > sb_ relname, buf - > tag . blockNum ,
buffer , buf - > blind . relname , buf - > tag . blockNum ,
PrivateRefCount [ buffer - 1 ] , file , line ) ;
}
}
@ -1822,7 +2013,7 @@ ReleaseAndReadBuffer_Debug(char *file,
fprintf ( stderr , " UNPIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = % ld , file : % s , line : % d \ n " ,
buffer , buf - > sb_ relname, buf - > tag . blockNum ,
buffer , buf - > blind . relname , buf - > tag . blockNum ,
PrivateRefCount [ buffer - 1 ] , file , line ) ;
}
if ( ShowPinTrace & & BufferIsLocal ( buffer ) & & is_userbuffer ( buffer ) )
@ -1831,7 +2022,7 @@ refcount = %ld, file: %s, line: %d\n",
fprintf ( stderr , " PIN(Rel&Rd) %ld relname = %s, blockNum = %d, \
refcount = % ld , file : % s , line : % d \ n " ,
b , buf - > sb_ relname, buf - > tag . blockNum ,
b , buf - > blind . relname , buf - > tag . blockNum ,
PrivateRefCount [ b - 1 ] , file , line ) ;
}
return b ;
@ -1983,11 +2174,43 @@ _bm_die(Oid dbId, Oid relId, int blkNo, int bufNo,
# endif /* BMTRACE */
/*
* SetBufferCommitInfoNeedsSave
*
* Mark a buffer dirty when we have updated tuple commit - status bits in it .
*
* This is similar to WriteNoReleaseBuffer , except that we do not set
* SharedBufferChanged or BufferDirtiedByMe , because we have not made a
* critical change that has to be flushed to disk before xact commit - - - the
* status - bit update could be redone by someone else just as easily . The
* buffer will be marked dirty , but it will not be written to disk until
* there is another reason to write it .
*
* This routine might get called many times on the same page , if we are making
* the first scan after commit of an xact that added / deleted many tuples .
* So , be as quick as we can if the buffer is already dirty .
*/
void
SetBufferCommitInfoNeedsSave ( Buffer buffer )
{
if ( ! BufferIsLocal ( buffer ) )
CommitInfoNeedsSave [ buffer - 1 ] + + ;
BufferDesc * bufHdr ;
if ( BufferIsLocal ( buffer ) )
return ;
if ( BAD_BUFFER_ID ( buffer ) )
return ;
bufHdr = & BufferDescriptors [ buffer - 1 ] ;
if ( ( bufHdr - > flags & ( BM_DIRTY | BM_JUST_DIRTIED ) ) ! =
( BM_DIRTY | BM_JUST_DIRTIED ) )
{
SpinAcquire ( BufMgrLock ) ;
Assert ( bufHdr - > refcount > 0 ) ;
bufHdr - > flags | = ( BM_DIRTY | BM_JUST_DIRTIED ) ;
SpinRelease ( BufMgrLock ) ;
}
}
void
@ -2175,7 +2398,16 @@ static void StartBufferIO(BufferDesc *buf, bool forInput)
Assert ( ! ( buf - > flags & BM_IO_IN_PROGRESS ) ) ;
buf - > flags | = BM_IO_IN_PROGRESS ;
# ifdef HAS_TEST_AND_SET
Assert ( S_LOCK_FREE ( & ( buf - > io_in_progress_lock ) ) )
/*
* There used to be
*
* Assert ( S_LOCK_FREE ( & ( buf - > io_in_progress_lock ) ) ) ;
*
* here , but that ' s wrong because of the way WaitIO works : someone else
* waiting for the I / O to complete will succeed in grabbing the lock for
* a few instructions , and if we context - swap back to here the Assert
* could fail . Tiny window for failure , but I ' ve seen it happen - - tgl
*/
S_LOCK ( & ( buf - > io_in_progress_lock ) ) ;
# endif /* HAS_TEST_AND_SET */
InProgressBuf = buf ;
@ -2217,7 +2449,7 @@ static void ContinueBufferIO(BufferDesc *buf, bool forInput)
IsForInput = forInput ;
}
extern void InitBufferIO ( void )
void InitBufferIO ( void )
{
InProgressBuf = ( BufferDesc * ) 0 ;
}
@ -2229,7 +2461,7 @@ extern void InitBufferIO(void)
* set in case of output , this routine would kill all
* backends and reset postmaster .
*/
extern void AbortBufferIO ( void )
void AbortBufferIO ( void )
{
BufferDesc * buf = InProgressBuf ;
if ( buf )
@ -2252,8 +2484,8 @@ extern void AbortBufferIO(void)
buf - > flags | = BM_DIRTY ;
}
buf - > flags | = BM_IO_ERROR ;
TerminateBufferIO ( buf ) ;
buf - > flags & = ~ BM_IO_IN_PROGRESS ;
TerminateBufferIO ( buf ) ;
SpinRelease ( BufMgrLock ) ;
}
}