@ -28,6 +28,7 @@
# include "access/xlog.h"
# include "access/xlog.h"
# include "access/xlogutils.h"
# include "access/xlogutils.h"
# include "commands/tablespace.h"
# include "commands/tablespace.h"
# include "common/file_utils.h"
# include "miscadmin.h"
# include "miscadmin.h"
# include "pg_trace.h"
# include "pg_trace.h"
# include "pgstat.h"
# include "pgstat.h"
@ -754,138 +755,274 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
}
}
/*
/*
* mdread ( ) - - Read the specified block from a relation .
* Convert an array of buffer address into an array of iovec objects , and
* return the number that were required . ' iov ' must have enough space for up
* to ' nblocks ' elements , but the number used may be less depending on
* merging . In the case of a run of fully contiguous buffers , a single iovec
* will be populated that can be handled as a plain non - vectored I / O .
*/
*/
void
static int
mdread ( SMgrRelation reln , ForkNumber forknum , BlockNumber blocknum ,
buffers_to_iovec ( struct iovec * iov , void * * buffers , int nblocks )
void * buffer )
{
{
off_t seekpos ;
struct iovec * iovp ;
int nbytes ;
int iovcnt ;
MdfdVec * v ;
/* If this build supports direct I/O, the buffer must be I/O aligned. */
Assert ( nblocks > = 1 ) ;
if ( PG_O_DIRECT ! = 0 & & PG_IO_ALIGN_SIZE < = BLCKSZ )
Assert ( ( uintptr_t ) buffer = = TYPEALIGN ( PG_IO_ALIGN_SIZE , buffer ) ) ;
TRACE_POSTGRESQL_SMGR_MD_READ_START ( forknum , blocknum ,
/* If this build supports direct I/O, buffers must be I/O aligned. */
reln - > smgr_rlocator . locator . spcOid ,
for ( int i = 0 ; i < nblocks ; + + i )
reln - > smgr_rlocator . locator . dbOid ,
{
reln - > smgr_rlocator . locator . relNumber ,
if ( PG_O_DIRECT ! = 0 & & PG_IO_ALIGN_SIZE < = BLCKSZ )
reln - > smgr_rlocator . backend ) ;
Assert ( ( uintptr_t ) buffers [ i ] = =
TYPEALIGN ( PG_IO_ALIGN_SIZE , buffers [ i ] ) ) ;
v = _mdfd_getseg ( reln , forknum , blocknum , false ,
}
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY ) ;
seekpos = ( off_t ) BLCKSZ * ( blocknum % ( ( BlockNumber ) RELSEG_SIZE ) ) ;
/* Start the first iovec off with the first buffer. */
iovp = & iov [ 0 ] ;
iovp - > iov_base = buffers [ 0 ] ;
iovp - > iov_len = BLCKSZ ;
iovcnt = 1 ;
Assert ( seekpos < ( off_t ) BLCKSZ * RELSEG_SIZE ) ;
/* Try to merge the rest. */
for ( int i = 1 ; i < nblocks ; + + i )
{
void * buffer = buffers [ i ] ;
nbytes = FileRead ( v - > mdfd_vfd , buffer , BLCKSZ , seekpos , WAIT_EVENT_DATA_FILE_READ ) ;
if ( ( ( char * ) iovp - > iov_base + iovp - > iov_len ) = = buffer )
{
/* Contiguous with the last iovec. */
iovp - > iov_len + = BLCKSZ ;
}
else
{
/* Need a new iovec. */
iovp + + ;
iovp - > iov_base = buffer ;
iovp - > iov_len = BLCKSZ ;
iovcnt + + ;
}
}
TRACE_POSTGRESQL_SMGR_MD_READ_DONE ( forknum , blocknum ,
return iovcnt ;
reln - > smgr_rlocator . locator . spcOid ,
}
reln - > smgr_rlocator . locator . dbOid ,
reln - > smgr_rlocator . locator . relNumber ,
reln - > smgr_rlocator . backend ,
nbytes ,
BLCKSZ ) ;
if ( nbytes ! = BLCKSZ )
/*
 * mdreadv() -- Read the specified blocks from a relation.
 *
 * Fills the 'nblocks' BLCKSZ-sized buffers in 'buffers' with the blocks
 * starting at 'blocknum'.  The request is processed in chunks, each of which
 * stays within one segment file and covers at most PG_IOV_MAX blocks.  A
 * chunk is read with FileReadV(), and the read is reissued for the remainder
 * whenever fewer bytes than requested come back.
 *
 * An I/O error raises ERROR.  Hitting EOF before a chunk is complete also
 * raises ERROR, unless zero_damaged_pages is on or we are InRecovery, in
 * which case the blocks not fully read are zero-filled.
 */
void
mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		void **buffers, BlockNumber nblocks)
{
	while (nblocks > 0)
	{
		struct iovec iov[PG_IOV_MAX];
		BlockNumber segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
		off_t		seekpos = (off_t) BLCKSZ * segoff;
		MdfdVec    *v;
		BlockNumber chunk_blocks;
		size_t		chunk_bytes;
		size_t		done_bytes = 0;
		int			iovcnt;
		int			nbytes;

		v = _mdfd_getseg(reln, forknum, blocknum, false,
						 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

		Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

		/* Stop at the segment boundary, and at the size of our iovec array. */
		chunk_blocks = Min(nblocks, RELSEG_SIZE - segoff);
		chunk_blocks = Min(chunk_blocks, lengthof(iov));

		iovcnt = buffers_to_iovec(iov, buffers, chunk_blocks);
		chunk_bytes = chunk_blocks * BLCKSZ;

		/*
		 * Keep reading until the chunk is complete.  A short read is not
		 * taken to mean end-of-file; only a read returning zero bytes is.
		 */
		do
		{
			TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
												reln->smgr_rlocator.locator.spcOid,
												reln->smgr_rlocator.locator.dbOid,
												reln->smgr_rlocator.locator.relNumber,
												reln->smgr_rlocator.backend);
			nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
							   WAIT_EVENT_DATA_FILE_READ);
			TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
											   reln->smgr_rlocator.locator.spcOid,
											   reln->smgr_rlocator.locator.dbOid,
											   reln->smgr_rlocator.locator.relNumber,
											   reln->smgr_rlocator.backend,
											   nbytes,
											   chunk_bytes - done_bytes);

#ifdef SIMULATE_SHORT_READ
			nbytes = Min(nbytes, 4096);
#endif

			if (nbytes < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read blocks %u..%u in file \"%s\": %m",
								blocknum,
								blocknum + chunk_blocks - 1,
								FilePathName(v->mdfd_vfd))));

			if (nbytes == 0)
			{
				/*
				 * We are at or past EOF, possibly in the middle of a block.
				 * Upper levels should never ask for a nonexistent block, so
				 * this is normally an error.  With zero_damaged_pages on, or
				 * while InRecovery, hand back zeroes instead; this covers,
				 * for example, an update to a block that was later truncated
				 * away.
				 */
				if (!(zero_damaged_pages || InRecovery))
					ereport(ERROR,
							(errcode(ERRCODE_DATA_CORRUPTED),
							 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
									blocknum,
									blocknum + chunk_blocks - 1,
									FilePathName(v->mdfd_vfd),
									done_bytes,
									chunk_bytes)));

				/* Zero every block that was not read in full. */
				for (BlockNumber blk = done_bytes / BLCKSZ;
					 blk < chunk_blocks;
					 ++blk)
					memset(buffers[blk], 0, BLCKSZ);
				break;
			}

			/* One pass is usually enough. */
			done_bytes += nbytes;
			Assert(done_bytes <= chunk_bytes);

			if (done_bytes != chunk_bytes)
			{
				/* Short read: advance the position and trim the iovecs. */
				seekpos += nbytes;
				iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
			}
		} while (done_bytes != chunk_bytes);

		/* Move on to whatever is left of the request. */
		nblocks -= chunk_blocks;
		buffers += chunk_blocks;
		blocknum += chunk_blocks;
	}
}
/*
/*
* mdwrite ( ) - - Write the supplied block at the appropriate location .
* mdwritev ( ) - - Write the supplied blocks at the appropriate location .
*
*
* This is to be used only for updating already - existing blocks of a
* This is to be used only for updating already - existing blocks of a
* relation ( ie , those before the current EOF ) . To extend a relation ,
* relation ( ie , those before the current EOF ) . To extend a relation ,
* use mdextend ( ) .
* use mdextend ( ) .
*/
*/
void
void
mdwrite ( SMgrRelation reln , ForkNumber forknum , BlockNumber blocknum ,
mdwritev ( SMgrRelation reln , ForkNumber forknum , BlockNumber blocknum ,
const void * buffer , bool skipFsync )
const void * * buffers , BlockNumber nblocks , bool skipFsync )
{
{
off_t seekpos ;
int nbytes ;
MdfdVec * v ;
/* If this build supports direct I/O, the buffer must be I/O aligned. */
if ( PG_O_DIRECT ! = 0 & & PG_IO_ALIGN_SIZE < = BLCKSZ )
Assert ( ( uintptr_t ) buffer = = TYPEALIGN ( PG_IO_ALIGN_SIZE , buffer ) ) ;
/* This assert is too expensive to have on normally ... */
/* This assert is too expensive to have on normally ... */
# ifdef CHECK_WRITE_VS_EXTEND
# ifdef CHECK_WRITE_VS_EXTEND
Assert ( blocknum < mdnblocks ( reln , forknum ) ) ;
Assert ( blocknum < mdnblocks ( reln , forknum ) ) ;
# endif
# endif
TRACE_POSTGRESQL_SMGR_MD_WRITE_START ( forknum , blocknum ,
while ( nblocks > 0 )
reln - > smgr_rlocator . locator . spcOid ,
{
reln - > smgr_rlocator . locator . dbOid ,
struct iovec iov [ PG_IOV_MAX ] ;
reln - > smgr_rlocator . locator . relNumber ,
int iovcnt ;
reln - > smgr_rlocator . backend ) ;
off_t seekpos ;
int nbytes ;
MdfdVec * v ;
BlockNumber nblocks_this_segment ;
size_t transferred_this_segment ;
size_t size_this_segment ;
v = _mdfd_getseg ( reln , forknum , blocknum , skipFsync ,
v = _mdfd_getseg ( reln , forknum , blocknum , skipFsync ,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY ) ;
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY ) ;
seekpos = ( off_t ) BLCKSZ * ( blocknum % ( ( BlockNumber ) RELSEG_SIZE ) ) ;
seekpos = ( off_t ) BLCKSZ * ( blocknum % ( ( BlockNumber ) RELSEG_SIZE ) ) ;
Assert ( seekpos < ( off_t ) BLCKSZ * RELSEG_SIZE ) ;
Assert ( seekpos < ( off_t ) BLCKSZ * RELSEG_SIZE ) ;
nbytes = FileWrite ( v - > mdfd_vfd , buffer , BLCKSZ , seekpos , WAIT_EVENT_DATA_FILE_WRITE ) ;
nblocks_this_segment =
Min ( nblocks ,
RELSEG_SIZE - ( blocknum % ( ( BlockNumber ) RELSEG_SIZE ) ) ) ;
nblocks_this_segment = Min ( nblocks_this_segment , lengthof ( iov ) ) ;
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE ( forknum , blocknum ,
iovcnt = buffers_to_iovec ( iov , ( void * * ) buffers , nblocks_this_segment ) ;
reln - > smgr_rlocator . locator . spcOid ,
size_this_segment = nblocks_this_segment * BLCKSZ ;
reln - > smgr_rlocator . locator . dbOid ,
transferred_this_segment = 0 ;
reln - > smgr_rlocator . locator . relNumber ,
reln - > smgr_rlocator . backend ,
nbytes ,
BLCKSZ ) ;
if ( nbytes ! = BLCKSZ )
/*
{
* Inner loop to continue after a short write . If the reason is that
if ( nbytes < 0 )
* we ' re out of disk space , a future attempt should get an ENOSPC
ereport ( ERROR ,
* error from the kernel .
( errcode_for_file_access ( ) ,
*/
errmsg ( " could not write block %u in file \" %s \" : %m " ,
for ( ; ; )
blocknum , FilePathName ( v - > mdfd_vfd ) ) ) ) ;
{
/* short write: complain appropriately */
TRACE_POSTGRESQL_SMGR_MD_WRITE_START ( forknum , blocknum ,
ereport ( ERROR ,
reln - > smgr_rlocator . locator . spcOid ,
( errcode ( ERRCODE_DISK_FULL ) ,
reln - > smgr_rlocator . locator . dbOid ,
errmsg ( " could not write block %u in file \" %s \" : wrote only %d of %d bytes " ,
reln - > smgr_rlocator . locator . relNumber ,
blocknum ,
reln - > smgr_rlocator . backend ) ;
FilePathName ( v - > mdfd_vfd ) ,
nbytes = FileWriteV ( v - > mdfd_vfd , iov , iovcnt , seekpos ,
nbytes , BLCKSZ ) ,
WAIT_EVENT_DATA_FILE_WRITE ) ;
errhint ( " Check free disk space. " ) ) ) ;
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE ( forknum , blocknum ,
}
reln - > smgr_rlocator . locator . spcOid ,
reln - > smgr_rlocator . locator . dbOid ,
reln - > smgr_rlocator . locator . relNumber ,
reln - > smgr_rlocator . backend ,
nbytes ,
size_this_segment - transferred_this_segment ) ;
# ifdef SIMULATE_SHORT_WRITE
nbytes = Min ( nbytes , 4096 ) ;
# endif
if ( ! skipFsync & & ! SmgrIsTemp ( reln ) )
if ( nbytes < 0 )
register_dirty_segment ( reln , forknum , v ) ;
{
bool enospc = errno = = ENOSPC ;
ereport ( ERROR ,
( errcode_for_file_access ( ) ,
errmsg ( " could not write blocks %u..%u in file \" %s \" : %m " ,
blocknum ,
blocknum + nblocks_this_segment - 1 ,
FilePathName ( v - > mdfd_vfd ) ) ,
enospc ? errhint ( " Check free disk space. " ) : 0 ) ) ;
}
/* One loop should usually be enough. */
transferred_this_segment + = nbytes ;
Assert ( transferred_this_segment < = size_this_segment ) ;
if ( transferred_this_segment = = size_this_segment )
break ;
/* Adjust position and iovecs after a short write. */
seekpos + = nbytes ;
iovcnt = compute_remaining_iovec ( iov , iov , iovcnt , nbytes ) ;
}
if ( ! skipFsync & & ! SmgrIsTemp ( reln ) )
register_dirty_segment ( reln , forknum , v ) ;
nblocks - = nblocks_this_segment ;
buffers + = nblocks_this_segment ;
blocknum + = nblocks_this_segment ;
}
}
}
/*
/*
* mdwriteback ( ) - - Tell the kernel to write pages back to storage .
* mdwriteback ( ) - - Tell the kernel to write pages back to storage .
*
*