@ -306,7 +306,10 @@ static void walkdir(const char *path,
# ifdef PG_FLUSH_DATA_WORKS
# ifdef PG_FLUSH_DATA_WORKS
static void pre_sync_fname ( const char * fname , bool isdir , int elevel ) ;
static void pre_sync_fname ( const char * fname , bool isdir , int elevel ) ;
# endif
# endif
static void fsync_fname_ext ( const char * fname , bool isdir , int elevel ) ;
static void datadir_fsync_fname ( const char * fname , bool isdir , int elevel ) ;
static int fsync_fname_ext ( const char * fname , bool isdir , bool ignore_perm , int elevel ) ;
static int fsync_parent_path ( const char * fname , int elevel ) ;
/*
/*
@ -413,54 +416,158 @@ pg_flush_data(int fd, off_t offset, off_t amount)
* indicate the OS just doesn ' t allow / require fsyncing directories .
* indicate the OS just doesn ' t allow / require fsyncing directories .
*/
*/
void
void
fsync_fname ( char * fname , bool isdir )
fsync_fname ( const char * fname , bool isdir )
{
fsync_fname_ext ( fname , isdir , false , ERROR ) ;
}
/*
* durable_rename - - rename ( 2 ) wrapper , issuing fsyncs required for durability
*
* This routine ensures that , after returning , the effect of renaming file
* persists in case of a crash . A crash while this routine is running will
* leave you with either the pre - existing or the moved file in place of the
* new file ; no mixed state or truncated files are possible .
*
* It does so by using fsync on the old filename and the possibly existing
* target filename before the rename , and the target file and directory after .
*
* Note that rename ( ) cannot be used across arbitrary directories , as they
* might not be on the same filesystem . Therefore this routine does not
* support renaming across directories .
*
* Log errors with the caller specified severity .
*
* Returns 0 if the operation succeeded , - 1 otherwise . Note that errno is not
* valid upon return .
*/
int
durable_rename ( const char * oldfile , const char * newfile , int elevel )
{
{
int fd ;
int fd ;
int returncode ;
/*
/*
* Some OSs require directories to be opened read - only whereas other
* First fsync the old and target path ( if it exists ) , to ensure that they
* systems don ' t allow us to fsync files opened read - only ; so we need both
* are properly persistent on disk . Syncing the target file is not
* cases here
* strictly necessary , but it makes it easier to reason about crashes ;
* because it ' s then guaranteed that either source or target file exists
* after a crash .
*/
*/
if ( ! isdir )
if ( fsync_fname_ext ( oldfile , false , false , elevel ) ! = 0 )
fd = OpenTransientFile ( fname ,
return - 1 ;
O_RDWR | PG_BINARY ,
S_IRUSR | S_IWUSR ) ;
fd = OpenTransientFile ( ( char * ) newfile , PG_BINARY | O_RDWR , 0 ) ;
if ( fd < 0 )
{
if ( errno ! = ENOENT )
{
ereport ( elevel ,
( errcode_for_file_access ( ) ,
errmsg ( " could not open file \" %s \" : %m " , newfile ) ) ) ;
return - 1 ;
}
}
else
else
fd = OpenTransientFile ( fname ,
{
O_RDONLY | PG_BINARY ,
if ( pg_fsync ( fd ) ! = 0 )
S_IRUSR | S_IWUSR ) ;
{
int save_errno ;
/* close file upon error, might not be in transaction context */
save_errno = errno ;
CloseTransientFile ( fd ) ;
errno = save_errno ;
ereport ( elevel ,
( errcode_for_file_access ( ) ,
errmsg ( " could not fsync file \" %s \" : %m " , newfile ) ) ) ;
return - 1 ;
}
CloseTransientFile ( fd ) ;
}
/* Time to do the real deal... */
if ( rename ( oldfile , newfile ) < 0 )
{
ereport ( elevel ,
( errcode_for_file_access ( ) ,
errmsg ( " could not rename file \" %s \" to \" %s \" : %m " ,
oldfile , newfile ) ) ) ;
return - 1 ;
}
/*
/*
* Some OSs don ' t allow us to open directories at all ( Windows returns
* To guarantee renaming the file is persistent , fsync the file with its
* EACCES )
* new name , and its containing directory .
*/
*/
if ( fd < 0 & & isdir & & ( errno = = EISDIR | | errno = = EACCES ) )
if ( fsync_fname_ext ( newfile , false , false , elevel ) ! = 0 )
return ;
return - 1 ;
else if ( fd < 0 )
if ( fsync_parent_path ( newfile , elevel ) ! = 0 )
ereport ( ERROR ,
return - 1 ;
( errcode_for_file_access ( ) ,
errmsg ( " could not open file \" %s \" : %m " , fname ) ) ) ;
returncode = pg_fsync ( fd ) ;
return 0 ;
}
/*
* durable_link_or_rename - - rename a file in a durable manner .
*
* Similar to durable_rename ( ) , except that this routine tries ( but does not
* guarantee ) not to overwrite the target file .
*
* Note that a crash in an unfortunate moment can leave you with two links to
* the target file .
*
* Log errors with the caller specified severity .
*
* Returns 0 if the operation succeeded , - 1 otherwise . Note that errno is not
* valid upon return .
*/
int
durable_link_or_rename ( const char * oldfile , const char * newfile , int elevel )
{
/*
* Ensure that , if we crash directly after the rename / link , a file with
* valid contents is moved into place .
*/
if ( fsync_fname_ext ( oldfile , false , false , elevel ) ! = 0 )
return - 1 ;
/* Some OSs don't allow us to fsync directories at all */
# if HAVE_WORKING_LINK
if ( returncode ! = 0 & & isdir & & errno = = EBADF )
if ( link ( oldfile , newfile ) < 0 )
{
{
CloseTransientFile ( fd ) ;
ereport ( elevel ,
return ;
( errcode_for_file_access ( ) ,
errmsg ( " could not link file \" %s \" to \" %s \" : %m " ,
oldfile , newfile ) ) ) ;
return - 1 ;
}
}
unlink ( oldfile ) ;
if ( returncode ! = 0 )
# else
ereport ( ERROR ,
/* XXX: Add racy file existence check? */
if ( rename ( oldfile , newfile ) < 0 )
{
ereport ( elevel ,
( errcode_for_file_access ( ) ,
( errcode_for_file_access ( ) ,
errmsg ( " could not fsync file \" %s \" : %m " , fname ) ) ) ;
errmsg ( " could not rename file \" %s \" to \" %s \" : %m " ,
oldfile , newfile ) ) ) ;
return - 1 ;
}
# endif
CloseTransientFile ( fd ) ;
/*
}
* Make change persistent in case of an OS crash , both the new entry and
* its parent directory need to be flushed .
*/
if ( fsync_fname_ext ( newfile , false , false , elevel ) ! = 0 )
return - 1 ;
/* Same for parent directory */
if ( fsync_parent_path ( newfile , elevel ) ! = 0 )
return - 1 ;
return 0 ;
}
/*
/*
* InitFileAccess - - - initialize this module during backend startup
* InitFileAccess - - - initialize this module during backend startup
@ -2581,10 +2688,10 @@ SyncDataDirectory(void)
* in pg_tblspc , they ' ll get fsync ' d twice . That ' s not an expected case
* in pg_tblspc , they ' ll get fsync ' d twice . That ' s not an expected case
* so we don ' t worry about optimizing it .
* so we don ' t worry about optimizing it .
*/
*/
walkdir ( " . " , fsync_fname_ext , false , LOG ) ;
walkdir ( " . " , datadir_ fsync_fname, false , LOG ) ;
if ( xlog_is_symlink )
if ( xlog_is_symlink )
walkdir ( " pg_xlog " , fsync_fname_ext , false , LOG ) ;
walkdir ( " pg_xlog " , datadir_ fsync_fname, false , LOG ) ;
walkdir ( " pg_tblspc " , fsync_fname_ext , true , LOG ) ;
walkdir ( " pg_tblspc " , datadir_ fsync_fname, true , LOG ) ;
}
}
/*
/*
@ -2698,15 +2805,26 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
# endif /* PG_FLUSH_DATA_WORKS */
# endif /* PG_FLUSH_DATA_WORKS */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Unreadable files are to be skipped silently, so ask fsync_fname_ext()
	 * to ignore permission errors on open (ignore_perm = true).  Its result
	 * is deliberately discarded: this function returns void, and
	 * fsync_fname_ext() reports other failures itself at elevel.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
/*
/*
* fsync_fname_ext - - Try to fsync a file or directory
* fsync_fname_ext - - Try to fsync a file or directory
*
*
* Ignores errors trying to open unreadable files , or trying to fsync
* If ignore_perm is true , ignore errors upon trying to open unreadable
* directories on systems where that isn ' t allowed / required , and logs other
* files . Logs other errors at a caller - specified level .
* errors at a caller - specified level .
*
* Returns 0 if the operation succeeded , - 1 otherwise .
*/
*/
static void
static int
fsync_fname_ext ( const char * fname , bool isdir , int elevel )
fsync_fname_ext ( const char * fname , bool isdir , bool ignore_perm , int elevel )
{
{
int fd ;
int fd ;
int flags ;
int flags ;
@ -2724,20 +2842,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
else
else
flags | = O_RDONLY ;
flags | = O_RDONLY ;
fd = OpenTransientFile ( ( char * ) fname , flags , 0 ) ;
/*
/*
* Open the file , silently ignoring errors about unreadable files ( or
* Some OSs don ' t allow us to open directories at all ( Windows returns
* unsupported operations , e . g . opening a directory under Windows ) , and
* EACCES ) , just ignore the error in that case . If desired also silently
* loggin g others .
* ignore errors about unreadable files . Log others .
*/
*/
fd = OpenTransientFile ( ( char * ) fname , flags , 0 ) ;
if ( fd < 0 & & isdir & & ( errno = = EISDIR | | errno = = EACCES ) )
if ( fd < 0 )
return 0 ;
else if ( fd < 0 & & ignore_perm & & errno = = EACCES )
return 0 ;
else if ( fd < 0 )
{
{
if ( errno = = EACCES | | ( isdir & & errno = = EISDIR ) )
return ;
ereport ( elevel ,
ereport ( elevel ,
( errcode_for_file_access ( ) ,
( errcode_for_file_access ( ) ,
errmsg ( " could not open file \" %s \" : %m " , fname ) ) ) ;
errmsg ( " could not open file \" %s \" : %m " , fname ) ) ) ;
return ;
return - 1 ;
}
}
returncode = pg_fsync ( fd ) ;
returncode = pg_fsync ( fd ) ;
@ -2747,9 +2868,49 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
* those errors . Anything else needs to be logged .
* those errors . Anything else needs to be logged .
*/
*/
if ( returncode ! = 0 & & ! ( isdir & & errno = = EBADF ) )
if ( returncode ! = 0 & & ! ( isdir & & errno = = EBADF ) )
{
int save_errno ;
/* close file upon error, might not be in transaction context */
save_errno = errno ;
( void ) CloseTransientFile ( fd ) ;
errno = save_errno ;
ereport ( elevel ,
ereport ( elevel ,
( errcode_for_file_access ( ) ,
( errcode_for_file_access ( ) ,
errmsg ( " could not fsync file \" %s \" : %m " , fname ) ) ) ;
errmsg ( " could not fsync file \" %s \" : %m " , fname ) ) ) ;
return - 1 ;
}
( void ) CloseTransientFile ( fd ) ;
( void ) CloseTransientFile ( fd ) ;
return 0 ;
}
/*
 * fsync_parent_path -- fsync the directory that contains a file or directory
 *
 * This is aimed at making file operations persistent on disk in case of
 * an OS crash or power failure.
 *
 * The parent directory is derived from fname with get_parent_directory() and
 * then synced through fsync_fname_ext(), which logs errors at the
 * caller-specified elevel.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.
 */
static int
fsync_parent_path(const char *fname, int elevel)
{
	char		dirpath[MAXPGPATH];

	/* Work on a private copy; get_parent_directory() edits it in place. */
	strlcpy(dirpath, fname, sizeof(dirpath));
	get_parent_directory(dirpath);

	/*
	 * When the input is just a file name, get_parent_directory() leaves an
	 * empty string behind (see comments in path.c).  Treat that as the
	 * current directory.
	 */
	if (dirpath[0] == '\0')
		strlcpy(dirpath, ".", sizeof(dirpath));

	/* Sync it as a directory, without ignoring permission errors. */
	return (fsync_fname_ext(dirpath, true, false, elevel) != 0) ? -1 : 0;
}
}