/*-------------------------------------------------------------------------
 *
 * xlogutils.c
 *
 * PostgreSQL transaction log manager utility routines
 *
 * This file contains support routines that are used by XLOG replay functions.
 * None of this code is used during normal system operation.
 *
 *
 * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/transam/xlogutils.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/xlog.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "storage/smgr.h"
#include "utils/guc.h"
#include "utils/hsearch.h"
#include "utils/rel.h"

/*
 * During XLOG replay, we may see XLOG records for incremental updates of
 * pages that no longer exist, because their relation was later dropped or
 * truncated.  (Note: this is only possible when full_page_writes = OFF,
 * since when it's ON, the first reference we see to a page should always
 * be a full-page rewrite not an incremental update.)  Rather than simply
 * ignoring such records, we make a note of the referenced page, and then
 * complain if we don't actually see a drop or truncate covering the page
 * later in replay.
 */
typedef struct xl_invalid_page_key
{
    RelFileNode node;           /* the relation */
    ForkNumber  forkno;         /* the fork number */
    BlockNumber blkno;          /* the page */
} xl_invalid_page_key;

typedef struct xl_invalid_page
{
    xl_invalid_page_key key;    /* hash key ... must be first */
    bool        present;        /* page existed but contained zeroes */
} xl_invalid_page;

static HTAB *invalid_page_tab = NULL;

/* Report a reference to an invalid page */
static void
report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno,
                    BlockNumber blkno, bool present)
{
    char       *path = relpathperm(node, forkno);

    if (present)
        elog(elevel, "page %u of relation %s is uninitialized",
             blkno, path);
    else
        elog(elevel, "page %u of relation %s does not exist",
             blkno, path);
    pfree(path);
}

/* Log a reference to an invalid page */
static void
log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno,
                 bool present)
{
    xl_invalid_page_key key;
    xl_invalid_page *hentry;
    bool        found;

    /*
     * Once recovery has reached a consistent state, the invalid-page table
     * should be empty and remain so.  If a reference to an invalid page is
     * found after consistency is reached, PANIC immediately.  This might
     * seem aggressive, but it's better than letting the invalid reference
     * linger in the hash table until the end of recovery and PANIC there,
     * which might come only much later if this is a standby server.
     */
    if (reachedConsistency)
    {
        report_invalid_page(WARNING, node, forkno, blkno, present);
        elog(PANIC, "WAL contains references to invalid pages");
    }

    /*
     * Log references to invalid pages at DEBUG1 level.  This allows some
     * tracing of the cause (note the elog context mechanism will tell us
     * something about the XLOG record that generated the reference).
     */
    if (log_min_messages <= DEBUG1 || client_min_messages <= DEBUG1)
        report_invalid_page(DEBUG1, node, forkno, blkno, present);

    if (invalid_page_tab == NULL)
    {
        /* create hash table when first needed */
        HASHCTL     ctl;

        memset(&ctl, 0, sizeof(ctl));
        ctl.keysize = sizeof(xl_invalid_page_key);
        ctl.entrysize = sizeof(xl_invalid_page);
        ctl.hash = tag_hash;

        invalid_page_tab = hash_create("XLOG invalid-page table",
                                       100,
                                       &ctl,
                                       HASH_ELEM | HASH_FUNCTION);
    }

    /* we currently assume xl_invalid_page_key contains no padding */
    key.node = node;
    key.forkno = forkno;
    key.blkno = blkno;
    hentry = (xl_invalid_page *)
        hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);

    if (!found)
    {
        /* hash_search already filled in the key */
        hentry->present = present;
    }
    else
    {
        /* repeat reference ... leave "present" as it was */
    }
}

/* Forget any invalid pages >= minblkno, because they've been dropped */
static void
forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
{
    HASH_SEQ_STATUS status;
    xl_invalid_page *hentry;

    if (invalid_page_tab == NULL)
        return;                 /* nothing to do */

    hash_seq_init(&status, invalid_page_tab);

    while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
    {
        if (RelFileNodeEquals(hentry->key.node, node) &&
            hentry->key.forkno == forkno &&
            hentry->key.blkno >= minblkno)
        {
            if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
            {
                char       *path = relpathperm(hentry->key.node, forkno);

                elog(DEBUG2, "page %u of relation %s has been dropped",
                     hentry->key.blkno, path);
                pfree(path);
            }

            if (hash_search(invalid_page_tab,
                            (void *) &hentry->key,
                            HASH_REMOVE, NULL) == NULL)
                elog(ERROR, "hash table corrupted");
        }
    }
}

/* Forget any invalid pages in a whole database */
static void
forget_invalid_pages_db(Oid dbid)
{
    HASH_SEQ_STATUS status;
    xl_invalid_page *hentry;

    if (invalid_page_tab == NULL)
        return;                 /* nothing to do */

    hash_seq_init(&status, invalid_page_tab);

    while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
    {
        if (hentry->key.node.dbNode == dbid)
        {
            if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
            {
                char       *path = relpathperm(hentry->key.node, hentry->key.forkno);

                elog(DEBUG2, "page %u of relation %s has been dropped",
                     hentry->key.blkno, path);
                pfree(path);
            }

            if (hash_search(invalid_page_tab,
                            (void *) &hentry->key,
                            HASH_REMOVE, NULL) == NULL)
                elog(ERROR, "hash table corrupted");
        }
    }
}

/* Are there any unresolved references to invalid pages? */
bool
XLogHaveInvalidPages(void)
{
    if (invalid_page_tab != NULL &&
        hash_get_num_entries(invalid_page_tab) > 0)
        return true;
    return false;
}

/* Complain about any remaining invalid-page entries */
void
XLogCheckInvalidPages(void)
{
    HASH_SEQ_STATUS status;
    xl_invalid_page *hentry;
    bool        foundone = false;

    if (invalid_page_tab == NULL)
        return;                 /* nothing to do */

    hash_seq_init(&status, invalid_page_tab);

    /*
     * Our strategy is to emit WARNING messages for all remaining entries and
     * only PANIC after we've dumped all the available info.
     */
    while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
    {
        report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno,
                            hentry->key.blkno, hentry->present);
        foundone = true;
    }

    if (foundone)
        elog(PANIC, "WAL contains references to invalid pages");

    hash_destroy(invalid_page_tab);
    invalid_page_tab = NULL;
}

/*
 * XLogReadBufferForRedo
 *		Read a page during XLOG replay
 *
 * Reads a block referenced by a WAL record into shared buffer cache, and
 * determines what needs to be done to redo the changes to it.  If the WAL
 * record includes a full-page image of the page, it is restored.
 *
 * 'lsn' is the LSN of the record being replayed.  It is compared with the
 * page's LSN to determine if the record has already been replayed.
 * 'rnode' and 'blkno' point to the block being replayed (main fork number
 * is implied, use XLogReadBufferForRedoExtended for other forks).
 * 'block_index' identifies the backup block in the record for the page.
 *
 * Returns one of the following:
 *
 *	BLK_NEEDS_REDO	- changes from the WAL record need to be applied
 *	BLK_DONE		- block doesn't need replaying
 *	BLK_RESTORED	- block was restored from a full-page image included in
 *					  the record
 *	BLK_NOTFOUND	- block was not found (because it was truncated away by
 *					  an operation later in the WAL stream)
 *
 * On return, the buffer is locked in exclusive-mode, and returned in *buf.
 * Note that the buffer is locked and returned even if it doesn't need
 * replaying.  (Getting the buffer lock is not really necessary during
 * single-process crash recovery, but some subroutines such as MarkBufferDirty
 * will complain if we don't have the lock.  In hot standby mode it's
 * definitely necessary.)
 */
XLogRedoAction
XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, int block_index,
                      RelFileNode rnode, BlockNumber blkno,
                      Buffer *buf)
{
    return XLogReadBufferForRedoExtended(lsn, record, block_index,
                                         rnode, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, false, buf);
}
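
/*
 * A minimal calling sketch for a redo function (illustrative only; the
 * record layout and the "xlrec" variable are assumptions, not part of this
 * file).  The buffer must be released even when no redo is needed:
 *
 *		Buffer		buf;
 *
 *		if (XLogReadBufferForRedo(lsn, record, 0, xlrec->node, xlrec->blkno,
 *								  &buf) == BLK_NEEDS_REDO)
 *		{
 *			Page		page = BufferGetPage(buf);
 *
 *			... apply the changes described by the record to "page" ...
 *			PageSetLSN(page, lsn);
 *			MarkBufferDirty(buf);
 *		}
 *		if (BufferIsValid(buf))
 *			UnlockReleaseBuffer(buf);
 */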

/*
 * XLogReadBufferForRedoExtended
 *		Like XLogReadBufferForRedo, but with extra options.
 *
 * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
 * with all-zeroes pages up to the referenced block number.  In
 * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value
 * is always BLK_NEEDS_REDO.
 *
 * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock
 * parameter.  Do not use an inconsistent combination!)
 *
 * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer
 * using LockBufferForCleanup(), instead of a regular exclusive lock.
 */
XLogRedoAction
XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record,
                              int block_index, RelFileNode rnode,
                              ForkNumber forkno, BlockNumber blkno,
                              ReadBufferMode mode, bool get_cleanup_lock,
                              Buffer *buf)
{
    if (record->xl_info & XLR_BKP_BLOCK(block_index))
    {
        *buf = RestoreBackupBlock(lsn, record, block_index,
                                  get_cleanup_lock, true);
        return BLK_RESTORED;
    }
    else
    {
        *buf = XLogReadBufferExtended(rnode, forkno, blkno, mode);
        if (BufferIsValid(*buf))
        {
            if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
            {
                if (get_cleanup_lock)
                    LockBufferForCleanup(*buf);
                else
                    LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
            }
            if (lsn <= PageGetLSN(BufferGetPage(*buf)))
                return BLK_DONE;
            else
                return BLK_NEEDS_REDO;
        }
        else
            return BLK_NOTFOUND;
    }
}

/*
 * XLogReadBuffer
 *		Read a page during XLOG replay.
 *
 * This is a shorthand of XLogReadBufferExtended() followed by
 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main
 * fork.
 *
 * (Getting the buffer lock is not really necessary during single-process
 * crash recovery, but some subroutines such as MarkBufferDirty will complain
 * if we don't have the lock.  In hot standby mode it's definitely necessary.)
 *
 * The returned buffer is exclusively-locked.
 *
 * For historical reasons, instead of a ReadBufferMode argument, this only
 * supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
 * modes.
 */
Buffer
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
{
    Buffer      buf;

    buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
                                 init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
    if (BufferIsValid(buf) && !init)
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    return buf;
}
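
/*
 * A usage sketch (illustrative only; "xlrec" stands in for the caller's
 * decoded record).  With init = true the page comes back zeroed and locked,
 * ready to be rebuilt from the record; with init = false, InvalidBuffer
 * means the page was dropped or truncated later in the WAL stream and the
 * update should be silently skipped:
 *
 *		Buffer		buf = XLogReadBuffer(xlrec->node, xlrec->blkno, false);
 *
 *		if (BufferIsValid(buf))
 *		{
 *			... apply the change, PageSetLSN(), MarkBufferDirty() ...
 *			UnlockReleaseBuffer(buf);
 *		}
 */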

/*
 * XLogReadBufferExtended
 *		Read a page during XLOG replay
 *
 * This is functionally comparable to ReadBufferExtended.  There's some
 * differences in the behavior wrt. the "mode" argument:
 *
 * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
 * return InvalidBuffer.  In this case the caller should silently skip the
 * update on this page.  (In this situation, we expect that the page was later
 * dropped or truncated.  If we don't see evidence of that later in the WAL
 * sequence, we'll complain at the end of WAL replay.)
 *
 * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
 * with all-zeroes pages up to the given block number.
 *
 * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
 * exist, and we don't check for all-zeroes.  Thus, no log entry is made
 * to imply that the page should be dropped or truncated later.
 */
Buffer
XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
                       BlockNumber blkno, ReadBufferMode mode)
{
    BlockNumber lastblock;
    Buffer      buffer;
    SMgrRelation smgr;

    Assert(blkno != P_NEW);

    /* Open the relation at smgr level */
    smgr = smgropen(rnode, InvalidBackendId);

    /*
     * Create the target file if it doesn't already exist.  This lets us cope
     * if the replay sequence contains writes to a relation that is later
     * deleted.  (The original coding of this routine would instead suppress
     * the writes, but that seems like it risks losing valuable data if the
     * filesystem loses an inode during a crash.  Better to write the data
     * until we are actually told to delete the file.)
     */
    smgrcreate(smgr, forknum, true);

    lastblock = smgrnblocks(smgr, forknum);

    if (blkno < lastblock)
    {
        /* page exists in file */
        buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
                                           mode, NULL);
    }
    else
    {
        /* hm, page doesn't exist in file */
        if (mode == RBM_NORMAL)
        {
            log_invalid_page(rnode, forknum, blkno, false);
            return InvalidBuffer;
        }
        if (mode == RBM_NORMAL_NO_LOG)
            return InvalidBuffer;
        /* OK to extend the file */
        /* we do this in recovery only - no rel-extension lock needed */
        Assert(InRecovery);
        buffer = InvalidBuffer;
        do
        {
            if (buffer != InvalidBuffer)
            {
                if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
                    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                ReleaseBuffer(buffer);
            }
            buffer = ReadBufferWithoutRelcache(rnode, forknum,
                                               P_NEW, mode, NULL);
        }
        while (BufferGetBlockNumber(buffer) < blkno);
        /* Handle the corner case that P_NEW returns non-consecutive pages */
        if (BufferGetBlockNumber(buffer) != blkno)
        {
            if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
            ReleaseBuffer(buffer);
            buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
                                               mode, NULL);
        }
    }

    if (mode == RBM_NORMAL)
    {
        /* check that page has been initialized */
        Page        page = (Page) BufferGetPage(buffer);

        /*
         * We assume that PageIsNew is safe without a lock.  During recovery,
         * there should be no other backends that could modify the buffer at
         * the same time.
         */
        if (PageIsNew(page))
        {
            ReleaseBuffer(buffer);
            log_invalid_page(rnode, forknum, blkno, true);
            return InvalidBuffer;
        }
    }

    return buffer;
}
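
/*
 * A mode-selection sketch (illustrative only): a replay routine that must
 * visit pages which may legitimately be unused and all-zero, such as the
 * btree vacuum scan in hot standby, can read with RBM_NORMAL_NO_LOG so that
 * an absent page neither raises an error nor leaves an invalid-page entry:
 *
 *		buffer = XLogReadBufferExtended(node, MAIN_FORKNUM, blkno,
 *										RBM_NORMAL_NO_LOG);
 *		if (BufferIsValid(buffer))
 *		{
 *			LockBufferForCleanup(buffer);
 *			... inspect or skip the page ...
 *			UnlockReleaseBuffer(buffer);
 *		}
 */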

/*
 * Restore a full-page image from a backup block attached to an XLOG record.
 *
 * lsn: LSN of the XLOG record being replayed
 * record: the complete XLOG record
 * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
 * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
 * keep_buffer: TRUE to return the buffer still locked and pinned
 *
 * Returns the buffer number containing the page.  Note this is not terribly
 * useful unless keep_buffer is specified as TRUE.
 *
 * Note: when a backup block is available in XLOG, we restore it
 * unconditionally, even if the page in the database appears newer.
 * This is to protect ourselves against database pages that were partially
 * or incorrectly written during a crash.  We assume that the XLOG data
 * must be good because it has passed a CRC check, while the database
 * page might not be.  This will force us to replay all subsequent
 * modifications of the page that appear in XLOG, rather than possibly
 * ignoring them as already applied, but that's not a huge drawback.
 *
 * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
 * else a normal exclusive lock is used.  During crash recovery, that's just
 * pro forma because there can't be any regular backends in the system, but
 * in hot standby mode the distinction is important.
 *
 * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
 * then caller is responsible for doing UnlockReleaseBuffer() later.  This
 * is needed in some cases when replaying XLOG records that touch multiple
 * pages, to prevent inconsistent states from being visible to other backends.
 * (Again, that's only important in hot standby mode.)
 */
Buffer
RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
                   bool get_cleanup_lock, bool keep_buffer)
{
    BkpBlock    bkpb;
    char       *blk;
    int         i;

    /* Locate requested BkpBlock in the record */
    blk = (char *) XLogRecGetData(record) + record->xl_len;
    for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    {
        if (!(record->xl_info & XLR_BKP_BLOCK(i)))
            continue;

        memcpy(&bkpb, blk, sizeof(BkpBlock));
        blk += sizeof(BkpBlock);

        if (i == block_index)
        {
            /* Found it, apply the update */
            return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
                                              keep_buffer);
        }

        blk += BLCKSZ - bkpb.hole_length;
    }

    /* Caller specified a bogus block_index */
    elog(ERROR, "failed to restore block_index %d", block_index);
    return InvalidBuffer;       /* keep compiler quiet */
}
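
/*
 * A usage sketch (illustrative only; "xlrec" is an assumption): the
 * traditional shape of a redo function that handles one optional backup
 * block directly, rather than going through XLogReadBufferForRedo:
 *
 *		if (record->xl_info & XLR_BKP_BLOCK(0))
 *			(void) RestoreBackupBlock(lsn, record, 0, false, false);
 *		else
 *		{
 *			Buffer		buf = XLogReadBuffer(xlrec->node, xlrec->blkno, false);
 *
 *			... if valid, apply the change, PageSetLSN(), MarkBufferDirty(),
 *			... and finally UnlockReleaseBuffer(buf);
 *		}
 */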

/*
 * Workhorse for RestoreBackupBlock usable without an xlog record
 *
 * Restores a full-page image from BkpBlock and a data pointer.
 */
Buffer
RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
                           bool get_cleanup_lock, bool keep_buffer)
{
    Buffer      buffer;
    Page        page;

    buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
                                    get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
    Assert(BufferIsValid(buffer));

    page = (Page) BufferGetPage(buffer);

    if (bkpb.hole_length == 0)
    {
        memcpy((char *) page, blk, BLCKSZ);
    }
    else
    {
        memcpy((char *) page, blk, bkpb.hole_offset);
        /* must zero-fill the hole */
        MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
        memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
               blk + bkpb.hole_offset,
               BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
    }

    /*
     * The checksum value on this page is currently invalid.  We don't need
     * to reset it here since it will be set before being written.
     */

    /*
     * The page may be uninitialized.  If so, we can't set the LSN because
     * that would corrupt the page.
     */
    if (!PageIsNew(page))
    {
        PageSetLSN(page, lsn);
    }
    MarkBufferDirty(buffer);

    if (!keep_buffer)
        UnlockReleaseBuffer(buffer);

    return buffer;
}

/*
 * Struct actually returned by CreateFakeRelcacheEntry, though the declared
 * return type is Relation.
 */
typedef struct
{
    RelationData reldata;       /* Note: this must be first */
    FormData_pg_class pgc;
} FakeRelCacheEntryData;

typedef FakeRelCacheEntryData *FakeRelCacheEntry;

/*
 * Create a fake relation cache entry for a physical relation
 *
 * It's often convenient to use the same functions in XLOG replay as in the
 * main codepath, but those functions typically work with a relcache entry.
 * We don't have a working relation cache during XLOG replay, but this
 * function can be used to create a fake relcache entry instead.  Only the
 * fields related to physical storage, like rd_rel, are initialized, so the
 * fake entry is only usable in low-level operations like ReadBuffer().
 *
 * Caller must free the returned entry with FreeFakeRelcacheEntry().
 */
Relation
CreateFakeRelcacheEntry(RelFileNode rnode)
{
    FakeRelCacheEntry fakeentry;
    Relation    rel;

    Assert(InRecovery);

    /* Allocate the Relation struct and all related space in one block. */
    fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
    rel = (Relation) fakeentry;

    rel->rd_rel = &fakeentry->pgc;
    rel->rd_node = rnode;
    /* We will never be working with temp rels during recovery */
    rel->rd_backend = InvalidBackendId;

    /* It must be a permanent table if we're in recovery. */
    rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;

    /* We don't know the name of the relation; use relfilenode instead */
    sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);

    /*
     * We set up the lockRelId in case anything tries to lock the dummy
     * relation.  Note that this is fairly bogus since relNode may be
     * different from the relation's OID.  It shouldn't really matter though,
     * since we are presumably running by ourselves and can't have any lock
     * conflicts ...
     */
    rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
    rel->rd_lockInfo.lockRelId.relId = rnode.relNode;

    rel->rd_smgr = NULL;

    return rel;
}

/*
 * Free a fake relation cache entry.
 */
void
FreeFakeRelcacheEntry(Relation fakerel)
{
    /* make sure the fakerel is not referenced by the SmgrRelation anymore */
    if (fakerel->rd_smgr != NULL)
        smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr);
    pfree(fakerel);
}
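
/*
 * A usage sketch (illustrative only; "xlrec" is assumed to carry the target
 * RelFileNode): replay code that needs to call a buffer- or smgr-level
 * routine taking a Relation brackets the call like this:
 *
 *		Relation	reln = CreateFakeRelcacheEntry(xlrec->node);
 *
 *		... pass "reln" to the routine, e.g. while replaying a truncation ...
 *		FreeFakeRelcacheEntry(reln);
 */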

/*
 * Drop a relation during XLOG replay
 *
 * This is called when the relation is about to be deleted; we need to remove
 * any open "invalid-page" records for the relation.
 */
void
XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
{
    forget_invalid_pages(rnode, forknum, 0);
}

/*
 * Drop a whole database during XLOG replay
 *
 * As above, but for DROP DATABASE instead of dropping a single rel
 */
void
XLogDropDatabase(Oid dbid)
{
    /*
     * This is unnecessarily heavy-handed, as it will close SMgrRelation
     * objects for other databases as well.  DROP DATABASE occurs seldom
     * enough that it's not worth introducing a variant of smgrclose for
     * just this purpose.  XXX: Or should we rather leave the smgr entries
     * dangling?
     */
    smgrcloseall();

    forget_invalid_pages_db(dbid);
}

/*
 * Truncate a relation during XLOG replay
 *
 * We need to clean up any open "invalid-page" records for the dropped pages.
 */
void
XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
                     BlockNumber nblocks)
{
    forget_invalid_pages(rnode, forkNum, nblocks);
}