@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
*    $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118.2.2 2007/01/27 20:15:55 tgl Exp $
*    $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118.2.3 2007/04/12 17:11:07 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -122,14 +122,19 @@ typedef struct
    BlockNumber segno;          /* which segment */
} PendingOperationTag;

typedef uint16 CycleCtr;        /* can be any convenient integer size */

typedef struct
{
    PendingOperationTag tag;    /* hash table key (must be first!) */
    int         failures;       /* number of failed attempts to fsync */
    bool        canceled;       /* T => request canceled, not yet removed */
    CycleCtr    cycle_ctr;      /* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;

static HTAB *pendingOpsTable = NULL;

static CycleCtr mdsync_cycle_ctr = 0;

/* local routines */
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
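The new cycle_ctr field works only because entries are compared to the counter by equality rather than ordering, so the small uint16 counter may wrap freely. A minimal standalone sketch (not part of the patch; CycleCtr is redefined locally from stdint.h) of that property:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t CycleCtr;

int
main(void)
{
    CycleCtr    prev = 65535;                   /* counter value when a request was queued */
    CycleCtr    cur = (CycleCtr) (prev + 1);    /* counter after mdsync advances it: wraps to 0 */

    /* the equality test used by the patch still identifies the old entry */
    assert((CycleCtr) (prev + 1) == cur);

    /* a plain ordering test would misclassify the entry after wraparound */
    printf("prev=%u cur=%u prev<cur? %d\n",
           (unsigned) prev, (unsigned) cur, prev < cur);
    return 0;
}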
@@ -749,70 +754,125 @@ mdimmedsync(SMgrRelation reln)
/*
 *  mdsync() -- Sync previous writes to stable storage.
 *
 * This is only called during checkpoints, and checkpoints should only
 * occur in processes that have created a pendingOpsTable.
 */
bool
mdsync(void)
{
    bool        need_retry;
    static bool mdsync_in_progress = false;

    HASH_SEQ_STATUS hstat;
    PendingOperationEntry *entry;
    int         absorb_counter;

    /*
     * This is only called during checkpoints, and checkpoints should only
     * occur in processes that have created a pendingOpsTable.
     */
    if (!pendingOpsTable)
        return false;
    /*
     * The fsync table could contain requests to fsync relations that have
     * been deleted (unlinked) by the time we get to them.  Rather than
     * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
     * what we will do is retry the whole process after absorbing fsync
     * request messages again.  Since mdunlink() queues a "revoke" message
     * before actually unlinking, the fsync request is guaranteed to be gone
     * the second time if it really was this case.  DROP DATABASE likewise
     * has to tell us to forget fsync requests before it starts deletions.
     *
     * If we are in the bgwriter, the sync had better include all fsync
     * requests that were queued by backends before the checkpoint REDO
     * point was determined.  We go that a little better by accepting all
     * requests queued up to the point where we start fsync'ing.
     */
    do {
        HASH_SEQ_STATUS hstat;
        PendingOperationEntry *entry;
        int         absorb_counter;

    AbsorbFsyncRequests();
    /*
     * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
     * checkpoint), we want to ignore fsync requests that are entered into the
     * hashtable after this point --- they should be processed next time,
     * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
     * ones: new ones will have cycle_ctr equal to the incremented value of
     * mdsync_cycle_ctr.
     *
     * In normal circumstances, all entries present in the table at this
     * point will have cycle_ctr exactly equal to the current (about to be old)
     * value of mdsync_cycle_ctr.  However, if we fail partway through the
     * fsync'ing loop, then older values of cycle_ctr might remain when we
     * come back here to try again.  Repeated checkpoint failures would
     * eventually wrap the counter around to the point where an old entry
     * might appear new, causing us to skip it, possibly allowing a checkpoint
     * to succeed that should not have.  To forestall wraparound, any time
     * the previous mdsync() failed to complete, run through the table and
     * forcibly set cycle_ctr = mdsync_cycle_ctr.
     *
     * Think not to merge this loop with the main loop, as the problem is
     * exactly that that loop may fail before having visited all the entries.
     * From a performance point of view it doesn't matter anyway, as this
     * path will never be taken in a system that's functioning normally.
     */
    if (mdsync_in_progress)
    {
        /* prior try failed, so update any stale cycle_ctr values */
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
            entry->cycle_ctr = mdsync_cycle_ctr;
        }
    }

    /* Advance counter so that new hashtable entries are distinguishable */
    mdsync_cycle_ctr++;

        need_retry = false;

    /* Set flag to detect failure if we don't reach the end of the loop */
    mdsync_in_progress = true;

    /* Now scan the hashtable for fsync requests to process */
    absorb_counter = FSYNCS_PER_ABSORB;
    hash_seq_init(&hstat, pendingOpsTable);
    while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
    {
        /*
         * If we are in the bgwriter, the sync had better include all fsync
         * requests that were queued by backends before the checkpoint REDO
         * point was determined.  We go that a little better by accepting all
         * requests queued up to the point where we start fsync'ing.
         *
         * If the entry is new then don't process it this time.  Note that
         * "continue" bypasses the hash-remove call at the bottom of the loop.
         */
        AbsorbFsyncRequests();

        if (entry->cycle_ctr == mdsync_cycle_ctr)
            continue;

        absorb_counter = FSYNCS_PER_ABSORB;
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)

        /* Else assert we haven't missed it */
        Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
        /*
         * If fsync is off then we don't have to bother opening the file
         * at all.  (We delay checking until this point so that changing
         * fsync on the fly behaves sensibly.)  Also, if the entry is
         * marked canceled, fall through to delete it.
         */
        if (enableFsync && !entry->canceled)
        {
            int         failures;

            /*
             * If fsync is off then we don't have to bother opening the file
             * at all.  (We delay checking until this point so that changing
             * fsync on the fly behaves sensibly.)
             *
             * If in bgwriter, we want to absorb pending requests every so
             * often to prevent overflow of the fsync request queue.  It is
             * unspecified whether newly-added entries will be visited by
             * hash_seq_search, but we don't care since we don't need to
             * process them anyway.
             */
            if (enableFsync)
            if (--absorb_counter <= 0)
            {
                AbsorbFsyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;
            }
            /*
             * The fsync table could contain requests to fsync segments that
             * have been deleted (unlinked) by the time we get to them.
             * Rather than just hoping an ENOENT (or EACCES on Windows) error
             * can be ignored, what we do on error is absorb pending requests
             * and then retry.  Since mdunlink() queues a "revoke" message
             * before actually unlinking, the fsync request is guaranteed to
             * be marked canceled after the absorb if it really was this case.
             * DROP DATABASE likewise has to tell us to forget fsync requests
             * before it starts deletions.
             */
            for (failures = 0;; failures++)     /* loop exits at "break" */
            {
                SMgrRelation reln;
                MdfdVec    *seg;

                /*
                 * If in bgwriter, we want to absorb pending requests every so
                 * often to prevent overflow of the fsync request queue.  This
                 * could result in deleting the current entry out from under
                 * our hashtable scan, so the procedure is to fall out of the
                 * scan and start over from the top of the function.
                 */
                if (--absorb_counter <= 0)
                {
                    need_retry = true;
                    break;
                }

                /*
                 * Find or create an smgr hash entry for this relation.  This
                 * may seem a bit unclean -- md calling smgr?  But it's really
@@ -833,7 +893,7 @@ mdsync(void)
                /*
                 * It is possible that the relation has been dropped or
                 * truncated since the fsync request was entered.  Therefore,
                 * allow ENOENT, but only if we didn't fail once already on
                 * allow ENOENT, but only if we didn't fail already on
                 * this file.  This applies both during _mdfd_getseg() and
                 * during FileSync, since fd.c might have closed the file
                 * behind our back.
@@ -841,45 +901,59 @@ mdsync(void)
                seg = _mdfd_getseg(reln,
                                   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
                                   true);

                if (seg == NULL ||
                    FileSync(seg->mdfd_vfd) < 0)
                if (seg != NULL &&
                    FileSync(seg->mdfd_vfd) >= 0)
                    break;      /* success; break out of retry loop */

                /*
                 * XXX is there any point in allowing more than one retry?
                 * Don't see one at the moment, but easy to change the
                 * test here if so.
                 */
                if (!FILE_POSSIBLY_DELETED(errno) ||
                    failures > 0)
                {
                    /*
                     * XXX is there any point in allowing more than one try?
                     * Don't see one at the moment, but easy to change the
                     * test here if so.
                     */
                    if (!FILE_POSSIBLY_DELETED(errno) ||
                        ++(entry->failures) > 1)
                    {
                        ereport(LOG,
                                (errcode_for_file_access(),
                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
                                        entry->tag.segno,
                                        entry->tag.rnode.spcNode,
                                        entry->tag.rnode.dbNode,
                                        entry->tag.rnode.relNode)));
                        return false;
                    }
                    else
                        ereport(DEBUG1,
                                (errcode_for_file_access(),
                                 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
                                        entry->tag.segno,
                                        entry->tag.rnode.spcNode,
                                        entry->tag.rnode.dbNode,
                                        entry->tag.rnode.relNode)));

                    need_retry = true;
                    continue;   /* don't delete the hashtable entry */
                    ereport(LOG,
                            (errcode_for_file_access(),
                             errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
                                    entry->tag.segno,
                                    entry->tag.rnode.spcNode,
                                    entry->tag.rnode.dbNode,
                                    entry->tag.rnode.relNode)));
                    return false;
                }
                }
                else
                    ereport(DEBUG1,
                            (errcode_for_file_access(),
                             errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
                                    entry->tag.segno,
                                    entry->tag.rnode.spcNode,
                                    entry->tag.rnode.dbNode,
                                    entry->tag.rnode.relNode)));

                /* Okay, delete this entry */
                if (hash_search(pendingOpsTable, &entry->tag,
                                HASH_REMOVE, NULL) == NULL)
                    elog(ERROR, "pendingOpsTable corrupted");

                /*
                 * Absorb incoming requests and check to see if canceled.
                 */
                AbsorbFsyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;     /* might as well... */

                if (entry->canceled)
                    break;
            }   /* end retry loop */
        }
    } while (need_retry);
        /*
         * If we get here, either we fsync'd successfully, or we don't have
         * to because enableFsync is off, or the entry is (now) marked
         * canceled.  Okay to delete it.
         */
        if (hash_search(pendingOpsTable, &entry->tag,
                        HASH_REMOVE, NULL) == NULL)
            elog(ERROR, "pendingOpsTable corrupted");
    }   /* end loop over hashtable entries */

    /* Flag successful completion of mdsync */
    mdsync_in_progress = false;

    return true;
}
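The retry loop above replaces the old restart-the-whole-scan approach: a failed fsync that may mean "file was unlinked" is retried at most once, after absorbing pending requests so that a queued revoke can mark the entry canceled. A compilable sketch of just that control flow, where try_fsync_segment() and absorb_and_check_canceled() are hypothetical stand-ins for _mdfd_getseg()/FileSync() and for AbsorbFsyncRequests() plus the canceled check:

#include <stdbool.h>
#include <stdio.h>

static bool
try_fsync_segment(int attempt)
{
    /* pretend the first attempt fails as if the file had been unlinked */
    return attempt > 0;
}

static bool
absorb_and_check_canceled(void)
{
    /* pretend an absorbed "revoke" message marks this entry canceled */
    return true;
}

int
main(void)
{
    int         failures;

    for (failures = 0;; failures++)     /* loop exits at "break" */
    {
        if (try_fsync_segment(failures))
        {
            printf("fsync succeeded on attempt %d\n", failures + 1);
            break;
        }
        if (failures > 0)
        {
            printf("second failure: report the error and give up\n");
            return 1;
        }
        if (absorb_and_check_canceled())
        {
            printf("request canceled while retrying; entry can be removed\n");
            break;
        }
    }
    return 0;
}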
@@ -924,8 +998,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
*
* The range of possible segment numbers is way less than the range of
* BlockNumber, so we can reserve high values of segno for special purposes.
* We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
* a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
* We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
* a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
* a whole database.  (These are a tad slow because the hash table has to be
* searched linearly, but it doesn't seem worth rethinking the table structure
* for them.)
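A self-contained sketch of the "reserve high segno values as command codes" convention described in the comment above; the sentinel definitions here are illustrative rather than quoted from md.c, and remember_fsync_request() is a local stand-in for the real RememberFsyncRequest():

#include <stdint.h>
#include <stdio.h>

typedef uint32_t BlockNumber;

#define InvalidBlockNumber      ((BlockNumber) 0xFFFFFFFF)
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)        /* illustrative value */
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber - 1)    /* illustrative value */

static void
remember_fsync_request(BlockNumber segno)
{
    /* real segment numbers are small, so top-of-range values act as commands */
    if (segno == FORGET_RELATION_FSYNC)
        printf("cancel pending fsyncs for one relation\n");
    else if (segno == FORGET_DATABASE_FSYNC)
        printf("cancel pending fsyncs for a whole database\n");
    else
        printf("register fsync request for segment %u\n", (unsigned) segno);
}

int
main(void)
{
    remember_fsync_request(3);                      /* ordinary segment */
    remember_fsync_request(FORGET_RELATION_FSYNC);  /* relation-wide cancel */
    remember_fsync_request(FORGET_DATABASE_FSYNC);  /* database-wide cancel */
    return 0;
}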
@@ -946,10 +1020,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
        {
            if (RelFileNodeEquals(entry->tag.rnode, rnode))
            {
                /* Okay, delete this entry */
                if (hash_search(pendingOpsTable, &entry->tag,
                                HASH_REMOVE, NULL) == NULL)
                    elog(ERROR, "pendingOpsTable corrupted");
                /* Okay, cancel this entry */
                entry->canceled = true;
            }
        }
    }
@@ -964,10 +1036,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
        {
            if (entry->tag.rnode.dbNode == rnode.dbNode)
            {
                /* Okay, delete this entry */
                if (hash_search(pendingOpsTable, &entry->tag,
                                HASH_REMOVE, NULL) == NULL)
                    elog(ERROR, "pendingOpsTable corrupted");
                /* Okay, cancel this entry */
                entry->canceled = true;
            }
        }
    }
@@ -987,8 +1057,25 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
                                                      &key,
                                                      HASH_ENTER,
                                                      &found);

        if (!found)             /* new entry, so initialize it */
            entry->failures = 0;

        /* if new or previously canceled entry, initialize it */
        if (!found || entry->canceled)
        {
            entry->canceled = false;
            entry->cycle_ctr = mdsync_cycle_ctr;
        }

        /*
         * NB: it's intentional that we don't change cycle_ctr if the entry
         * already exists.  The fsync request must be treated as old, even
         * though the new request will be satisfied too by any subsequent
         * fsync.
         *
         * However, if the entry is present but is marked canceled, we should
         * act just as though it wasn't there.  The only case where this could
         * happen would be if a file had been deleted, we received but did not
         * yet act on the cancel request, and the same relfilenode was then
         * assigned to a new file.  We mustn't lose the new request, but
         * it should be considered new not old.
         */
    }
}
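A small sketch (hypothetical Entry struct and register_request(), not the md.c code) of the rule spelled out in the NB comment above: an existing live entry keeps its old cycle_ctr, while a canceled entry that gets re-registered is revived and stamped with the current counter so it is treated as new:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t CycleCtr;

typedef struct
{
    bool        in_use;         /* stand-in for "found in the hash table" */
    bool        canceled;
    CycleCtr    cycle_ctr;
} Entry;

static CycleCtr sync_cycle_ctr = 7;     /* arbitrary current counter value */

static void
register_request(Entry *entry)
{
    bool        found = entry->in_use;

    entry->in_use = true;
    if (!found || entry->canceled)
    {
        /* new, or canceled-and-reused relfilenode: treat as a fresh request */
        entry->canceled = false;
        entry->cycle_ctr = sync_cycle_ctr;
    }
    /* otherwise keep the existing cycle_ctr: the request stays "old" */
}

int
main(void)
{
    Entry       e = {true, true, 3};    /* existing entry, already canceled */

    register_request(&e);
    printf("canceled=%d cycle_ctr=%u\n", e.canceled, (unsigned) e.cycle_ctr);  /* prints 0 and 7 */
    return 0;
}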