/*-------------------------------------------------------------------------
 *
 * checkpointer.c
 *
 * The checkpointer is new as of Postgres 9.2.  It handles all checkpoints.
 * Checkpoints are automatically dispatched after a certain amount of time has
 * elapsed since the last one, and it can be signaled to perform requested
 * checkpoints as well.  (The GUC parameter that mandates a checkpoint every
 * so many WAL segments is implemented by having backends signal when they
 * fill WAL segments; the checkpointer itself doesn't watch for the
 * condition.)
 *
 * The normal termination sequence is that checkpointer is instructed to
 * execute the shutdown checkpoint by SIGINT.  After that checkpointer waits
 * to be terminated via SIGUSR2, which instructs the checkpointer to exit(0).
 * All backends must be stopped before SIGINT or SIGUSR2 is issued!
 *
 * Emergency termination is by SIGQUIT; like any backend, the checkpointer
 * will simply abort and exit on SIGQUIT.
 *
 * If the checkpointer exits unexpectedly, the postmaster treats that the same
 * as a backend crash: shared memory may be corrupted, so remaining backends
 * should be killed by SIGQUIT and then a recovery cycle started.  (Even if
 * shared memory isn't corrupted, we have lost information about which
 * files need to be fsync'd for the next checkpoint, and so a system
 * restart needs to be forced.)
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/backend/postmaster/checkpointer.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <sys/time.h>
#include <time.h>

#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogrecovery.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/auxprocess.h"
#include "postmaster/bgwriter.h"
#include "postmaster/interrupt.h"
#include "replication/syncrep.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procsignal.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/resowner.h"


/*----------
 * Shared memory area for communication between checkpointer and backends
 *
 * The ckpt counters allow backends to watch for completion of a checkpoint
 * request they send.  Here's how it works:
 *	* At start of a checkpoint, checkpointer reads (and clears) the request
 *	  flags and increments ckpt_started, while holding ckpt_lck.
 *	* On completion of a checkpoint, checkpointer sets ckpt_done to
 *	  equal ckpt_started.
 *	* On failure of a checkpoint, checkpointer increments ckpt_failed
 *	  and sets ckpt_done to equal ckpt_started.
 *
 * The algorithm for backends is:
 *	1. Record current values of ckpt_failed and ckpt_started, and
 *	   set request flags, while holding ckpt_lck.
 *	2. Send signal to request checkpoint.
 *	3. Sleep until ckpt_started changes.  Now you know a checkpoint has
 *	   begun since you started this algorithm (although *not* that it was
 *	   specifically initiated by your signal), and that it is using your flags.
 *	4. Record new value of ckpt_started.
 *	5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
 *	   arithmetic here in case counters wrap around.)  Now you know a
 *	   checkpoint has started and completed, but not whether it was
 *	   successful.
 *	6. If ckpt_failed is different from the originally saved value,
 *	   assume request failed; otherwise it was definitely successful.
 *
 * ckpt_flags holds the OR of the checkpoint request flags sent by all
 * requesting backends since the last checkpoint start.  The flags are
 * chosen so that OR'ing is the correct way to combine multiple requests.
 *
 * The requests array holds fsync requests sent by backends and not yet
 * absorbed by the checkpointer.
 *
 * Unlike the checkpoint fields, the requests-related fields are protected by
 * CheckpointerCommLock.
 *----------
 */

typedef struct
{
    SyncRequestType type;       /* request type */
    FileTag     ftag;           /* file identifier */
} CheckpointerRequest;

typedef struct
{
    pid_t       checkpointer_pid;   /* PID (0 if not started) */

    slock_t     ckpt_lck;       /* protects all the ckpt_* fields */

    int         ckpt_started;   /* advances when checkpoint starts */
    int         ckpt_done;      /* advances when checkpoint done */
    int         ckpt_failed;    /* advances when checkpoint fails */

    int         ckpt_flags;     /* checkpoint flags, as defined in xlog.h */

    ConditionVariable start_cv; /* signaled when ckpt_started advances */
    ConditionVariable done_cv;  /* signaled when ckpt_done advances */

    int         num_requests;   /* current # of requests */
    int         max_requests;   /* allocated array size */
    CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
} CheckpointerShmemStruct;

static CheckpointerShmemStruct *CheckpointerShmem;
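
/*
 * Illustrative sketch only (not part of the real implementation, and
 * compiled out by the hypothetical guard macro below): one way a backend
 * could follow the request/completion protocol documented above the
 * CheckpointerShmemStruct definition.  The actual backend-side logic lives
 * in RequestCheckpoint(); the function name here is made up for the
 * example.
 */
#ifdef CHECKPOINTER_PROTOCOL_EXAMPLE
static void
ExampleRequestCheckpointAndWait(int reqflags)
{
    int         old_failed,
                old_started,
                new_started,
                new_failed;

    /* Step 1: save counters and publish request flags, under the spinlock */
    SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
    old_failed = CheckpointerShmem->ckpt_failed;
    old_started = CheckpointerShmem->ckpt_started;
    CheckpointerShmem->ckpt_flags |= reqflags;
    SpinLockRelease(&CheckpointerShmem->ckpt_lck);

    /* Step 2: wake the checkpointer via its advertised proc latch */
    SetLatch(&GetPGProcByNumber(ProcGlobal->checkpointerProc)->procLatch);

    /* Step 3: sleep until ckpt_started advances */
    ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv);
    for (;;)
    {
        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
        new_started = CheckpointerShmem->ckpt_started;
        SpinLockRelease(&CheckpointerShmem->ckpt_lck);
        if (new_started != old_started)
            break;
        ConditionVariableSleep(&CheckpointerShmem->start_cv,
                               WAIT_EVENT_CHECKPOINT_START);
    }
    ConditionVariableCancelSleep();

    /*
     * Steps 4-5: sleep until ckpt_done catches up with the recorded
     * ckpt_started value.  The signed subtraction keeps working even if
     * the counters wrap around (the "modulo arithmetic" noted above).
     */
    ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv);
    for (;;)
    {
        int         done;

        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
        done = CheckpointerShmem->ckpt_done;
        SpinLockRelease(&CheckpointerShmem->ckpt_lck);
        if (done - new_started >= 0)
            break;
        ConditionVariableSleep(&CheckpointerShmem->done_cv,
                               WAIT_EVENT_CHECKPOINT_DONE);
    }
    ConditionVariableCancelSleep();

    /* Step 6: if ckpt_failed moved, assume our request failed */
    SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
    new_failed = CheckpointerShmem->ckpt_failed;
    SpinLockRelease(&CheckpointerShmem->ckpt_lck);
    if (new_failed != old_failed)
        elog(ERROR, "checkpoint request failed");
}
#endif                          /* CHECKPOINTER_PROTOCOL_EXAMPLE */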

/* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */
#define WRITES_PER_ABSORB       1000

/*
 * GUC parameters
 */
int         CheckPointTimeout = 300;
int         CheckPointWarning = 30;
double      CheckPointCompletionTarget = 0.9;
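
/*
 * For reference (an assumption based on the documented GUC mappings, not
 * stated in this file): these variables back the checkpoint_timeout,
 * checkpoint_warning, and checkpoint_completion_target settings, so the
 * defaults above correspond to a postgresql.conf of:
 *
 *      checkpoint_timeout = 5min
 *      checkpoint_warning = 30s
 *      checkpoint_completion_target = 0.9
 */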

/*
 * Private state
 */
static bool ckpt_active = false;
static volatile sig_atomic_t ShutdownXLOGPending = false;

/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
static XLogRecPtr ckpt_start_recptr;
static double ckpt_cached_elapsed;

static pg_time_t last_checkpoint_time;
static pg_time_t last_xlog_switch_time;

/* Prototypes for private functions */

static void HandleCheckpointerInterrupts(void);
static void CheckArchiveTimeout(void);
static bool IsCheckpointOnSchedule(double progress);
static bool ImmediateCheckpointRequested(void);
static bool CompactCheckpointerRequestQueue(void);
static void UpdateSharedMemoryConfig(void);

/* Signal handlers */
static void ReqShutdownXLOG(SIGNAL_ARGS);


/*
 * Main entry point for checkpointer process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 */
void
CheckpointerMain(char *startup_data, size_t startup_data_len)
{
    sigjmp_buf  local_sigjmp_buf;
    MemoryContext checkpointer_context;

    Assert(startup_data_len == 0);

    MyBackendType = B_CHECKPOINTER;
    AuxiliaryProcessMainCommon();

    CheckpointerShmem->checkpointer_pid = MyProcPid;

    /*
     * Properly accept or ignore signals the postmaster might send us
     *
     * Note: we deliberately ignore SIGTERM, because during a standard Unix
     * system shutdown cycle, init will SIGTERM all processes at once.  We
     * want to wait for the backends to exit, whereupon the postmaster will
     * tell us it's okay to shut down (via SIGUSR2).
     */
    pqsignal(SIGHUP, SignalHandlerForConfigReload);
    pqsignal(SIGINT, ReqShutdownXLOG);
    pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
    /* SIGQUIT handler was already set up by InitPostmasterChild */
    pqsignal(SIGALRM, SIG_IGN);
    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    pqsignal(SIGUSR2, SignalHandlerForShutdownRequest);

    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    pqsignal(SIGCHLD, SIG_DFL);

    /*
     * Initialize so that first time-driven event happens at the correct time.
     */
    last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);

    /*
     * Write out stats after shutdown.  This needs to be called by exactly one
     * process during a normal shutdown, and since checkpointer is shut down
     * very late...
     *
     * While e.g. walsenders are active after the shutdown checkpoint has been
     * written (and thus could produce more stats), checkpointer stays around
     * after the shutdown checkpoint has been written.  postmaster will only
     * signal checkpointer to exit after all processes that could emit stats
     * have been shut down.
     */
    before_shmem_exit(pgstat_before_server_shutdown, 0);

    /*
     * Create a memory context that we will do all our work in.  We do this so
     * that we can reset the context during error recovery and thereby avoid
     * possible memory leaks.  Formerly this code just ran in
     * TopMemoryContext, but resetting that would be a really bad idea.
     */
    checkpointer_context = AllocSetContextCreate(TopMemoryContext,
                                                 "Checkpointer",
                                                 ALLOCSET_DEFAULT_SIZES);
    MemoryContextSwitchTo(checkpointer_context);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * You might wonder why this isn't coded as an infinite loop around a
     * PG_TRY construct.  The reason is that this is the bottom of the
     * exception stack, and so with PG_TRY there would be no exception handler
     * in force at all during the CATCH part.  By leaving the outermost setjmp
     * always active, we have at least some chance of recovering from an error
     * during error recovery.  (If we get into an infinite loop thereby, it
     * will soon be stopped by overflow of elog.c's internal state stack.)
     *
     * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask
     * (to wit, BlockSig) will be restored when longjmp'ing to here.  Thus,
     * signals other than SIGQUIT will be blocked until we complete error
     * recovery.  It might seem that this policy makes the HOLD_INTERRUPTS()
     * call redundant, but it is not since InterruptPending might be set
     * already.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * These operations are really just a minimal subset of
         * AbortTransaction().  We don't have very many resources to worry
         * about in checkpointer, but we do have LWLocks, buffers, and temp
         * files.
         */
        LWLockReleaseAll();
        ConditionVariableCancelSleep();
        pgstat_report_wait_end();
        UnlockBuffers();
        ReleaseAuxProcessResources(false);
        AtEOXact_Buffers(false);
        AtEOXact_SMgr();
        AtEOXact_Files(false);
        AtEOXact_HashTables(false);

        /* Warn any waiting backends that the checkpoint failed. */
        if (ckpt_active)
        {
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            CheckpointerShmem->ckpt_failed++;
            CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

            ckpt_active = false;
        }

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(checkpointer_context);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextReset(checkpointer_context);

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /*
         * Sleep at least 1 second after any error.  A write error is likely
         * to be repeated, and we don't want to be filling the error logs as
         * fast as we can.
         */
        pg_usleep(1000000L);
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);

    /*
     * Ensure all shared memory values are set correctly for the config.
     * Doing this here ensures no race conditions from other concurrent
     * updaters.
     */
    UpdateSharedMemoryConfig();

    /*
     * Advertise our proc number that backends can use to wake us up while
     * we're sleeping.
     */
    ProcGlobal->checkpointerProc = MyProcNumber;

    /*
     * Loop until we've been asked to write the shutdown checkpoint or
     * terminate.
     */
    for (;;)
    {
        bool        do_checkpoint = false;
        int         flags = 0;
        pg_time_t   now;
        int         elapsed_secs;
        int         cur_timeout;
        bool        chkpt_or_rstpt_requested = false;
        bool        chkpt_or_rstpt_timed = false;

        /* Clear any already-pending wakeups */
        ResetLatch(MyLatch);

        /*
         * Process any requests or signals received recently.
         */
        AbsorbSyncRequests();

        HandleCheckpointerInterrupts();
        if (ShutdownXLOGPending || ShutdownRequestPending)
            break;

        /*
         * Detect a pending checkpoint request by checking whether the flags
         * word in shared memory is nonzero.  We shouldn't need to acquire the
         * ckpt_lck for this.
         */
        if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
        {
            do_checkpoint = true;
            chkpt_or_rstpt_requested = true;
        }

        /*
         * Force a checkpoint if too much time has elapsed since the last one.
         * Note that we count a timed checkpoint in stats only when this
         * occurs without an external request, but we set the CAUSE_TIME flag
         * bit even if there is also an external request.
         */
        now = (pg_time_t) time(NULL);
        elapsed_secs = now - last_checkpoint_time;
        if (elapsed_secs >= CheckPointTimeout)
        {
            if (!do_checkpoint)
                chkpt_or_rstpt_timed = true;
            do_checkpoint = true;
            flags |= CHECKPOINT_CAUSE_TIME;
        }

        /*
         * Do a checkpoint if requested.
         */
        if (do_checkpoint)
        {
            bool        ckpt_performed = false;
            bool        do_restartpoint;

            /* Check if we should perform a checkpoint or a restartpoint. */
            do_restartpoint = RecoveryInProgress();

            /*
             * Atomically fetch the request flags to figure out what kind of a
             * checkpoint we should perform, and increase the started-counter
             * to acknowledge that we've started a new checkpoint.
             */
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            flags |= CheckpointerShmem->ckpt_flags;
            CheckpointerShmem->ckpt_flags = 0;
            CheckpointerShmem->ckpt_started++;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            ConditionVariableBroadcast(&CheckpointerShmem->start_cv);

            /*
             * The end-of-recovery checkpoint is a real checkpoint that's
             * performed while we're still in recovery.
             */
            if (flags & CHECKPOINT_END_OF_RECOVERY)
                do_restartpoint = false;

            if (chkpt_or_rstpt_timed)
            {
                chkpt_or_rstpt_timed = false;
                if (do_restartpoint)
                    PendingCheckpointerStats.restartpoints_timed++;
                else
                    PendingCheckpointerStats.num_timed++;
            }

            if (chkpt_or_rstpt_requested)
            {
                chkpt_or_rstpt_requested = false;
                if (do_restartpoint)
                    PendingCheckpointerStats.restartpoints_requested++;
                else
                    PendingCheckpointerStats.num_requested++;
            }

            /*
             * We will warn if (a) too soon since last checkpoint (whatever
             * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
             * since the last checkpoint start.  Note in particular that this
             * implementation will not generate warnings caused by
             * CheckPointTimeout < CheckPointWarning.
             */
            if (!do_restartpoint &&
                (flags & CHECKPOINT_CAUSE_XLOG) &&
                elapsed_secs < CheckPointWarning)
                ereport(LOG,
                        (errmsg_plural("checkpoints are occurring too frequently (%d second apart)",
                                       "checkpoints are occurring too frequently (%d seconds apart)",
                                       elapsed_secs,
                                       elapsed_secs),
                         errhint("Consider increasing the configuration parameter \"%s\".", "max_wal_size")));

            /*
             * Initialize checkpointer-private variables used during
             * checkpoint.
             */
            ckpt_active = true;
            if (do_restartpoint)
                ckpt_start_recptr = GetXLogReplayRecPtr(NULL);
            else
                ckpt_start_recptr = GetInsertRecPtr();
            ckpt_start_time = now;
            ckpt_cached_elapsed = 0;

            /*
             * Do the checkpoint.
             */
            if (!do_restartpoint)
                ckpt_performed = CreateCheckPoint(flags);
            else
                ckpt_performed = CreateRestartPoint(flags);

            /*
             * After any checkpoint, free all smgr objects.  Otherwise we
             * would never do so for dropped relations, as the checkpointer
             * does not process shared invalidation messages or call
             * AtEOXact_SMgr().
             */
            smgrdestroyall();

            /*
             * Indicate checkpoint completion to any waiting backends.
             */
            SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
            CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
            SpinLockRelease(&CheckpointerShmem->ckpt_lck);

            ConditionVariableBroadcast(&CheckpointerShmem->done_cv);

            if (!do_restartpoint)
            {
                /*
                 * Note we record the checkpoint start time not end time as
                 * last_checkpoint_time.  This is so that time-driven
                 * checkpoints happen at a predictable spacing.
                 */
                last_checkpoint_time = now;

                if (ckpt_performed)
                    PendingCheckpointerStats.num_performed++;
            }
            else
            {
                if (ckpt_performed)
                {
                    /*
                     * The same as for checkpoint.  Please see the
                     * corresponding comment.
                     */
                    last_checkpoint_time = now;

                    PendingCheckpointerStats.restartpoints_performed++;
                }
                else
                {
                    /*
                     * We were not able to perform the restartpoint
                     * (checkpoints throw an ERROR in case of error).  Most
                     * likely because we have not received any new checkpoint
                     * WAL records since the last restartpoint.  Try again in
                     * 15 s.
                     */
                    last_checkpoint_time = now - CheckPointTimeout + 15;
                }
            }

            ckpt_active = false;

            /*
             * We may have received an interrupt during the checkpoint and the
             * latch might have been reset (e.g. in CheckpointWriteDelay).
             */
            HandleCheckpointerInterrupts();
            if (ShutdownXLOGPending || ShutdownRequestPending)
                break;
        }

        /* Check for archive_timeout and switch xlog files if necessary. */
        CheckArchiveTimeout();

        /* Report pending statistics to the cumulative stats system */
        pgstat_report_checkpointer();
        pgstat_report_wal(true);

        /*
         * If any checkpoint flags have been set, redo the loop to handle the
         * checkpoint without sleeping.
         */
        if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
            continue;

        /*
         * Sleep until we are signaled or it's time for another checkpoint or
         * xlog file switch.
         */
        now = (pg_time_t) time(NULL);
        elapsed_secs = now - last_checkpoint_time;
        if (elapsed_secs >= CheckPointTimeout)
            continue;           /* no sleep for us ... */
        cur_timeout = CheckPointTimeout - elapsed_secs;
        if (XLogArchiveTimeout > 0 && !RecoveryInProgress())
        {
            elapsed_secs = now - last_xlog_switch_time;
            if (elapsed_secs >= XLogArchiveTimeout)
                continue;       /* no sleep for us ... */
            cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
        }

        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                         cur_timeout * 1000L /* convert to ms */ ,
                         WAIT_EVENT_CHECKPOINTER_MAIN);
    }

    /*
     * From here on, elog(ERROR) should end with exit(1), not send control
     * back to the sigsetjmp block above.
     */
    ExitOnAnyError = true;

    if (ShutdownXLOGPending)
    {
        /*
         * Close down the database.
         *
         * Since ShutdownXLOG() creates a restartpoint or checkpoint, and
         * updates the statistics, increment the checkpoint request and flush
         * out pending statistics.
         */
        PendingCheckpointerStats.num_requested++;
        ShutdownXLOG(0, 0);
        pgstat_report_checkpointer();
        pgstat_report_wal(true);

        /*
         * Tell postmaster that we're done.
         */
        SendPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN);
        ShutdownXLOGPending = false;
    }

    /*
     * Wait until we're asked to shut down.  By separating the writing of the
     * shutdown checkpoint from checkpointer exiting, checkpointer can perform
     * some should-be-as-late-as-possible work like writing out stats.
     */
    for (;;)
    {
        /* Clear any already-pending wakeups */
        ResetLatch(MyLatch);

        HandleCheckpointerInterrupts();

        if (ShutdownRequestPending)
            break;

        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
                         0,
                         WAIT_EVENT_CHECKPOINTER_SHUTDOWN);
    }

    /* Normal exit from the checkpointer is here */
    proc_exit(0);               /* done */
}
/*
 * Process any new interrupts.
 */
static void
HandleCheckpointerInterrupts(void)
{
	if (ProcSignalBarrierPending)
		ProcessProcSignalBarrier();

	if (ConfigReloadPending)
	{
		ConfigReloadPending = false;
		ProcessConfigFile(PGC_SIGHUP);

		/*
		 * Checkpointer is the last process to shut down, so we ask it to
		 * hold the keys for a range of other tasks, most of which have
		 * nothing to do with checkpointing at all.
		 *
		 * For various reasons, some config values can change dynamically,
		 * so the primary copy of them is held in shared memory to make sure
		 * all backends see the same value.  We make Checkpointer
		 * responsible for updating the shared memory copy if the parameter
		 * setting changes because of SIGHUP.
		 */
		UpdateSharedMemoryConfig();
	}

	/* Perform logging of memory contexts of this process */
	if (LogMemoryContextPending)
		ProcessLogMemoryContextInterrupt();
}
Skip checkpoints, archiving on idle systems.
Some background activity (like checkpoints, archive timeout, standby
snapshots) is not supposed to happen on an idle system. Unfortunately,
so far it was not easy to determine when a system is idle, which
defeated some of the attempts to avoid redundant activity on an idle
system.
To make that easier, allow individual WAL insertions to be marked as
not "important". By checking whether any important activity has
happened since the last time an action was performed, it is now easy
to check whether that action needs to be repeated.
Use the new facility for checkpoints, archive timeout and standby
snapshots.
The lack of such a facility causes some issues in older releases, but
in my opinion the consequences (superfluous checkpoints / archived
segments) aren't grave enough to warrant backpatching.
Author: Michael Paquier, editorialized by Andres Freund
Reviewed-By: Andres Freund, David Steele, Amit Kapila, Kyotaro HORIGUCHI
Bug: #13685
Discussion:
https://www.postgresql.org/message-id/20151016203031.3019.72930@wrigleys.postgresql.org
https://www.postgresql.org/message-id/CAB7nPqQcPqxEM3S735Bd2RzApNqSNJVietAC=6kfkYv_45dKwA@mail.gmail.com
Backpatch: -
9 years ago

/*
 * CheckArchiveTimeout -- check for archive_timeout and switch xlog files
 *
 * This will switch to a new WAL file and force an archive file write if
 * meaningful activity is recorded in the current WAL file.  This includes
 * most writes, including just a single checkpoint record, but excludes WAL
 * records that were inserted with the XLOG_MARK_UNIMPORTANT flag set (like
 * snapshots of running transactions).  Such records, depending on
 * configuration, occur at regular intervals and don't contain important
 * information.  This avoids generating archives with a few unimportant
 * records.
 */
static void
CheckArchiveTimeout(void)
{
	pg_time_t	now;
	pg_time_t	last_time;
	XLogRecPtr	last_switch_lsn;

	if (XLogArchiveTimeout <= 0 || RecoveryInProgress())
		return;

	now = (pg_time_t) time(NULL);

	/* First we do a quick check using possibly-stale local state. */
	if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout)
		return;

	/*
	 * Update local state ... note that last_xlog_switch_time is the last
	 * time a switch was performed *or requested*.
	 */
	last_time = GetLastSegSwitchData(&last_switch_lsn);

	last_xlog_switch_time = Max(last_xlog_switch_time, last_time);
	/* Now we can do the real checks */
	if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout)
	{
		/*
		 * Switch segment only when "important" WAL has been logged since
		 * the last segment switch (last_switch_lsn points to the end of the
		 * segment the last switch occurred in).
		 */
		if (GetLastImportantRecPtr() > last_switch_lsn)
		{
			XLogRecPtr	switchpoint;

			/* mark switch as unimportant, avoids triggering checkpoints */
			switchpoint = RequestXLogSwitch(true);

			/*
			 * If the returned pointer points exactly to a segment boundary,
			 * assume nothing happened.
			 */
Make WAL segment size configurable at initdb time.
For performance reasons a larger segment size than the default 16MB
can be useful. A larger segment size has two main benefits: firstly,
in setups using archiving, it makes it easier to write scripts that
can keep up with higher amounts of WAL; secondly, the WAL has to be
written and synced to disk less frequently.
But at the same time large segment sizes are disadvantageous for
smaller databases. So far the segment size had to be configured at
compile time, often making it unrealistic to choose one fitting a
particular load. Therefore change it to an initdb-time setting.
This includes a breaking change to the xlogreader.h API, which now
requires the current segment size to be configured. For that and
similar reasons a number of binaries had to be taught how to recognize
the current segment size.
Author: Beena Emerson, editorialized by Andres Freund
Reviewed-By: Andres Freund, David Steele, Kuntal Ghosh, Michael
Paquier, Peter Eisentraut, Robert Haas, Tushar Ahuja
Discussion: https://postgr.es/m/CAOG9ApEAcQ--1ieKbhFzXSQPw_YLmepaa4hNdnY5+ZULpt81Mw@mail.gmail.com
8 years ago
			if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0)
				elog(DEBUG1, "write-ahead log switch forced (\"archive_timeout\"=%d)",
					 XLogArchiveTimeout);
		}

		/*
		 * Update state in any case, so we don't retry constantly when the
		 * system is idle.
		 */
		last_xlog_switch_time = now;
	}
}
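The XLOG_MARK_UNIMPORTANT flag tested above is set through the regular
xloginsert API. A minimal sketch, modeled on the standby-snapshot pattern
the commit message mentions; the helper name and payload arguments are
illustrative, not code from this file:

static XLogRecPtr
LogUnimportantRecordSketch(void *payload, int len)
{
	XLogBeginInsert();
	XLogRegisterData((char *) payload, len);
	/* keep this record from counting as "important" activity */
	XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
	return XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
}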
/*
 * Returns true if an immediate checkpoint request is pending.  (Note that
 * this does not check the *current* checkpoint's IMMEDIATE flag, but
 * whether there is one pending behind it.)
 */
static bool
ImmediateCheckpointRequested(void)
{
Make checkpoint requests more robust.
Commit 6f6a6d8b1 introduced a delay of up to 2 seconds if we're trying
to request a checkpoint but the checkpointer hasn't started yet (or,
much less likely, our kill() call fails). However buildfarm experience
shows that that's not quite enough for slow or heavily-loaded machines.
There's no good reason to assume that the checkpointer won't start
eventually, so we may as well make the timeout much longer, say 60 sec.
However, if the caller didn't say CHECKPOINT_WAIT, it seems like a bad
idea to be waiting at all, much less for as long as 60 sec. We can
remove the need for that, and make this whole thing more robust, by
adjusting the code so that the existence of a pending checkpoint
request is clear from the contents of shared memory, and making sure
that the checkpointer process will notice it at startup even if it did
not get a signal. In this way there's no need for a non-CHECKPOINT_WAIT
call to wait at all; if it can't send the signal, it can nonetheless
assume that the checkpointer will eventually service the request.
A potential downside of this change is that "kill -INT" on the checkpointer
process is no longer enough to trigger a checkpoint, should anyone be
relying on something so hacky. But there's no obvious reason to do it
like that rather than issuing a plain old CHECKPOINT command, so we'll
assume that nobody is. There doesn't seem to be a way to preserve this
undocumented quasi-feature without introducing race conditions.
Since a principal reason for messing with this is to prevent intermittent
buildfarm failures, back-patch to all supported branches.
Discussion: https://postgr.es/m/27830.1552752475@sss.pgh.pa.us
7 years ago
	volatile CheckpointerShmemStruct *cps = CheckpointerShmem;
	/*
	 * We don't need to acquire the ckpt_lck in this case because we're only
	 * looking at a single flag bit.
	 */
	if (cps->ckpt_flags & CHECKPOINT_IMMEDIATE)
		return true;
	return false;
}
/*
 * CheckpointWriteDelay -- control rate of checkpoint
 *
 * This function is called after each page write performed by BufferSync().
 * It is responsible for throttling BufferSync()'s write rate to hit
 * checkpoint_completion_target.
 *
 * The checkpoint request flags should be passed in; currently the only one
 * examined is CHECKPOINT_IMMEDIATE, which disables delays between writes.
 *
 * 'progress' is an estimate of how much of the work has been done, as a
 * fraction between 0.0 meaning none, and 1.0 meaning all done.
 */
void
CheckpointWriteDelay(int flags, double progress)
{
	static int	absorb_counter = WRITES_PER_ABSORB;

	/* Do nothing if checkpoint is being executed by non-checkpointer process */
Fix management of pendingOpsTable in auxiliary processes.
mdinit() was misusing IsBootstrapProcessingMode() to decide whether to
create an fsync pending-operations table in the current process. This led
to creating a table not only in the startup and checkpointer processes as
intended, but also in the bgwriter process, not to mention other auxiliary
processes such as walwriter and walreceiver. Creation of the table in the
bgwriter is fatal, because it absorbs fsync requests that should have gone
to the checkpointer; instead they just sit in bgwriter local memory and are
never acted on. So writes performed by the bgwriter were not being fsync'd,
which could result in data loss after an OS crash. I think there is no
live bug with respect to walwriter and walreceiver because those never
perform any writes of shared buffers; but the potential is there for
future breakage in those processes too.
To fix, make AuxiliaryProcessMain() export the current process's
AuxProcType as a global variable, and then make mdinit() test directly for
the types of aux process that should have a pendingOpsTable. Having done
that, we might as well also get rid of the random bool flags such as
am_walreceiver that some of the aux processes had grown. (Note that we
could not have fixed the bug by examining those variables in mdinit(),
because it's called from BaseInit() which is run by AuxiliaryProcessMain()
before entering any of the process-type-specific code.)
Back-patch to 9.2, where the problem was introduced by the split-up of
bgwriter and checkpointer processes. The bogus pendingOpsTable exists
in walwriter and walreceiver processes in earlier branches, but absent
any evidence that it causes actual problems there, I'll leave the older
branches alone.
14 years ago
	if (!AmCheckpointerProcess())
		return;

	/*
	 * Perform the usual duties and take a nap, unless we're behind schedule,
	 * in which case we just try to catch up as quickly as possible.
	 */
	if (!(flags & CHECKPOINT_IMMEDIATE) &&
		!ShutdownXLOGPending &&
		!ShutdownRequestPending &&
		!ImmediateCheckpointRequested() &&
		IsCheckpointOnSchedule(progress))
	{
		if (ConfigReloadPending)
		{
			ConfigReloadPending = false;
			ProcessConfigFile(PGC_SIGHUP);
			/* update shmem copies of config variables */
			UpdateSharedMemoryConfig();
		}

		AbsorbSyncRequests();
		absorb_counter = WRITES_PER_ABSORB;

		CheckArchiveTimeout();

		/* Report interim statistics to the cumulative stats system */
		pgstat_report_checkpointer();

		/*
		 * This sleep used to be connected to bgwriter_delay, typically
		 * 200ms.  That resulted in more frequent wakeups if there was not
		 * much work to do.  Checkpointer and bgwriter are no longer related
		 * so take the Big Sleep.
		 */
		WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
				  100,
				  WAIT_EVENT_CHECKPOINT_WRITE_DELAY);
		ResetLatch(MyLatch);
	}
	else if (--absorb_counter <= 0)
	{
		/*
		 * Absorb pending fsync requests after every WRITES_PER_ABSORB write
		 * operations even when we don't sleep, to prevent overflow of the
		 * fsync request queue.
		 */
		AbsorbSyncRequests();
		absorb_counter = WRITES_PER_ABSORB;
	}

	/* Check for barrier events. */
	if (ProcSignalBarrierPending)
		ProcessProcSignalBarrier();
}
Also trigger restartpoints based on max_wal_size on standby.
When archive recovery and restartpoints were initially introduced,
checkpoint_segments was ignored on the grounds that the files restored from
archive don't consume any space in the recovery server. That was changed in
later releases, but even then it was arguably a feature rather than a bug,
as performing restartpoints as often as checkpoints during normal operation
might be excessive, but you might nevertheless not want to waste a lot of
space for pre-allocated WAL by setting checkpoint_segments to a high value.
But now that we have separate min_wal_size and max_wal_size settings, you
can bound WAL usage with max_wal_size, and still avoid consuming excessive
space by setting min_wal_size to a lower value, so that argument is moot.
There are still some issues with actually limiting the space usage to
max_wal_size: restartpoints in recovery can only start after seeing the
checkpoint record, while a checkpoint starts flushing buffers as soon as
the redo-pointer is set. A restartpoint is paced to happen at the same
leisurely speed, determined by checkpoint_completion_target, as checkpoints,
but because they are started later, max_wal_size can be exceeded by up to
one checkpoint cycle's worth of WAL, depending on
checkpoint_completion_target. But that seems better than not trying at all,
and max_wal_size is a soft limit anyway.
The documentation already claimed that max_wal_size is obeyed in recovery,
so this just fixes the behaviour to match the docs. However, add some
weasel-words there to mention that max_wal_size may well be exceeded by
some amount in recovery.
11 years ago

/*
 * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint
 * (or restartpoint) in time?
 *
 * Compares the current progress against the time/segments elapsed since the
 * last checkpoint, and returns true if the progress we've made this far is
 * greater than the elapsed time/segments.
 */
static bool
IsCheckpointOnSchedule(double progress)
{
	XLogRecPtr	recptr;
	struct timeval now;
	double		elapsed_xlogs,
				elapsed_time;

	Assert(ckpt_active);

	/* Scale progress according to checkpoint_completion_target. */
	progress *= CheckPointCompletionTarget;
	/*
	 * Check against the cached value first.  Only do the more expensive
	 * calculations once we reach the target previously calculated.  Since
	 * neither time nor the WAL insert pointer moves backwards, a freshly
	 * calculated value can only be greater than or equal to the cached
	 * value.
	 */
	if (progress < ckpt_cached_elapsed)
		return false;
	/*
	 * Check progress against WAL segments written and CheckPointSegments.
	 *
	 * We compare the current WAL insert location against the location
	 * computed before calling CreateCheckPoint.  The code in XLogInsert
	 * that actually triggers a checkpoint when CheckPointSegments is
	 * exceeded compares against RedoRecPtr, so this is not completely
	 * accurate.  However, it's good enough for our purposes, we're only
	 * calculating an estimate anyway.
	 *
	 * During recovery, we compare the last replayed WAL record's location
	 * with the location computed before calling CreateRestartPoint.  That
	 * maintains the same pacing as we have during checkpoints in normal
	 * operation, but we might exceed max_wal_size by a fair amount.  That's
	 * because there can be a large gap between a checkpoint's redo-pointer
	 * and the checkpoint record itself, and we only start the restartpoint
	 * after we've seen the checkpoint record.  (The gap is typically up to
	 * CheckPointSegments * checkpoint_completion_target where
	 * checkpoint_completion_target is the value that was in effect when the
	 * WAL was generated.)
	 */
	if (RecoveryInProgress())
		recptr = GetXLogReplayRecPtr(NULL);
	else
		recptr = GetInsertRecPtr();
	elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) /
					 wal_segment_size) / CheckPointSegments;
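	/*
	 * Worked example (illustrative numbers, not from the source): with
	 * wal_segment_size = 16MB and CheckPointSegments = 32, writing 128MB of
	 * WAL since checkpoint start gives elapsed_xlogs = (128 / 16) / 32 =
	 * 0.25; the checkpoint is behind schedule once the scaled progress
	 * drops below that fraction.
	 */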
	if (progress < elapsed_xlogs)
	{
		ckpt_cached_elapsed = elapsed_xlogs;
		return false;
	}

	/*
	 * Check progress against time elapsed and checkpoint_timeout.
	 */
	gettimeofday(&now, NULL);
	elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) +
					now.tv_usec / 1000000.0) / CheckPointTimeout;

	if (progress < elapsed_time)
	{
		ckpt_cached_elapsed = elapsed_time;
		return false;
	}

	/* It looks like we're on schedule. */
	return true;
}
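A worked example for IsCheckpointOnSchedule()'s time-based check
(illustrative settings only): with checkpoint_timeout = 300s and 150s
elapsed, elapsed_time is 0.5; since progress is scaled by
checkpoint_completion_target = 0.9, the checkpoint stays on schedule only
while the completed fraction is at least 0.5 / 0.9, i.e. roughly 56%.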
/* --------------------------------
 * signal handler routines
 * --------------------------------
 */

/* SIGINT: set flag to trigger writing of shutdown checkpoint */
static void
ReqShutdownXLOG(SIGNAL_ARGS)
{
	ShutdownXLOGPending = true;
	SetLatch(MyLatch);
}
/* --------------------------------
 * communication with backends
 * --------------------------------
 */

/*
 * CheckpointerShmemSize
 *		Compute space needed for checkpointer-related shared memory
 */
Size
CheckpointerShmemSize(void)
{
	Size		size;

	/*
	 * Currently, the size of the requests[] array is arbitrarily set equal
	 * to NBuffers.  This may prove too large or small ...
	 */
	size = offsetof(CheckpointerShmemStruct, requests);
	size = add_size(size, mul_size(NBuffers, sizeof(CheckpointerRequest)));

	return size;
}
/*
 * CheckpointerShmemInit
 *		Allocate and initialize checkpointer-related shared memory
 */
void
CheckpointerShmemInit(void)
{
Improve coding around the fsync request queue.
In all branches back to 8.3, this patch fixes a questionable assumption in
CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue that there are
no uninitialized pad bytes in the request queue structs. This would only
cause trouble if (a) there were such pad bytes, which could happen in 8.4
and up if the compiler makes enum ForkNumber narrower than 32 bits, but
otherwise would require not-currently-planned changes in the widths of
other typedefs; and (b) the kernel has not uniformly initialized the
contents of shared memory to zeroes. Still, it seems a tad risky, and we
can easily remove any risk by pre-zeroing the request array for ourselves.
In addition to that, we need to establish a coding rule that struct
RelFileNode can't contain any padding bytes, since such structs are copied
into the request array verbatim. (There are other places that are assuming
this anyway, it turns out.)
In 9.1 and up, the risk was a bit larger because we were also effectively
assuming that struct RelFileNodeBackend contained no pad bytes, and with
fields of different types in there, that would be much easier to break.
However, there is no good reason to ever transmit fsync or delete requests
for temp files to the bgwriter/checkpointer, so we can revert the request
structs to plain RelFileNode, getting rid of the padding risk and saving
some marginal number of bytes and cycles in fsync queue manipulation while
we are at it. The savings might be more than marginal during deletion of
a temp relation, because the old code transmitted an entirely useless but
nonetheless expensive-to-process ForgetRelationFsync request to the
background process, and also had the background process perform the file
deletion even though that can safely be done immediately.
In addition, make some cleanup of nearby comments and small improvements to
the code in CompactCheckpointerRequestQueue/CompactBgwriterRequestQueue.
14 years ago
	Size		size = CheckpointerShmemSize();
	bool		found;

	CheckpointerShmem = (CheckpointerShmemStruct *)
		ShmemInitStruct("Checkpointer Data",
						size,
						&found);

	if (!found)
	{
		/*
		 * First time through, so initialize.  Note that we zero the whole
		 * requests array; this is so that CompactCheckpointerRequestQueue
		 * can assume that any pad bytes in the request structs are zeroes.
		 */
		MemSet(CheckpointerShmem, 0, size);
		SpinLockInit(&CheckpointerShmem->ckpt_lck);
		CheckpointerShmem->max_requests = NBuffers;
		ConditionVariableInit(&CheckpointerShmem->start_cv);
		ConditionVariableInit(&CheckpointerShmem->done_cv);
	}
}
/*
 * RequestCheckpoint
 *		Called in backend processes to request a checkpoint
 *
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, ignoring the
 *		checkpoint_completion_target parameter.
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has
 *		occurred since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
 *		CHECKPOINT_END_OF_RECOVERY).
 *	CHECKPOINT_WAIT: wait for completion before returning (otherwise,
 *		just signal checkpointer to do it, and return).
 *	CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling.
 *		(This affects logging, and in particular enables CheckPointWarning.)
 */
void
RequestCheckpoint(int flags)
{
	int			ntries;
	int			old_failed,
				old_started;

	/*
	 * If in a standalone backend, just do it ourselves.
	 */
	if (!IsPostmasterEnvironment)
	{
		/*
		 * There's no point in doing slow checkpoints in a standalone
		 * backend, because there are no other backends the checkpoint could
		 * disrupt.
		 */
		CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE);
Give SMgrRelation pointers a well-defined lifetime.
After calling smgropen(), it was not clear how long you could continue
to use the result, because various code paths including cache
invalidation could call smgrclose(), which freed the memory.
Guarantee that the object won't be destroyed until the end of the
current transaction, or in recovery, the commit/abort record that
destroys the underlying storage.
smgrclose() is now just an alias for smgrrelease(). It closes files
and forgets all state except the rlocator, but keeps the SMgrRelation
object valid.
A new smgrdestroy() function is used by rare places that know there
should be no other references to the SMgrRelation.
The short version:
* smgrclose() is now just an alias for smgrrelease(). It releases
resources, but doesn't destroy until EOX
* smgrdestroy() now frees memory, and should rarely be used.
Existing code should be unaffected, but it is now possible for code that
has an SMgrRelation object to use it repeatedly during a transaction as
long as the storage hasn't been physically dropped. Such code would
normally hold a lock on the relation.
This also replaces the "ownership" mechanism of SMgrRelations with a
pin counter. An SMgrRelation can now be "pinned", which prevents it
from being destroyed at end of transaction. There can be multiple pins
on the same SMgrRelation. In practice, the pin mechanism is only used
by the relcache, so there cannot be more than one pin on the same
SMgrRelation. Except with swap_relation_files XXX
Author: Thomas Munro, Heikki Linnakangas
Reviewed-by: Robert Haas <robertmhaas@gmail.com>
Discussion: https://www.postgresql.org/message-id/CA%2BhUKGJ8NTvqLHz6dqbQnt2c8XCki4r2QvXjBQcXpVwxTY_pvA@mail.gmail.com
2 years ago
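		/*
		 * Summary sketch of the lifetime rules described above
		 * (illustrative restatement, not code from this file):
		 *
		 *	smgrclose(reln)    - alias for smgrrelease(); closes files but
		 *	                     keeps the SMgrRelation valid until EOX
		 *	smgrdestroy(reln)  - frees the object; rare, requires no other
		 *	                     references
		 *	smgrdestroyall()   - destroys all SMgrRelations, as used below
		 */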
		/* Free all smgr objects, as CheckpointerMain() normally would. */
		smgrdestroyall();

		return;
	}
/*
|
|
|
|
|
* Atomically set the request flags, and take a snapshot of the counters.
|
|
|
|
|
* When we see ckpt_started > old_started, we know the flags we set here
|
|
|
|
|
* have been seen by checkpointer.
|
|
|
|
|
*
|
|
|
|
|
* Note that we OR the flags with any existing flags, to avoid overriding
|
|
|
|
|
* a "stronger" request by another backend. The flag senses must be
|
|
|
|
|
* chosen to make this work!
|
|
|
|
|
*/
|
|
|
|
|
SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
|
|
|
|
|
|
|
|
|
|
old_failed = CheckpointerShmem->ckpt_failed;
|
|
|
|
|
old_started = CheckpointerShmem->ckpt_started;
|
	CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED);

	SpinLockRelease(&CheckpointerShmem->ckpt_lck);

	/*
	 * Set checkpointer's latch to request checkpoint.  It's possible that
	 * the checkpointer hasn't started yet, so we will retry a few times if
	 * needed.  (Actually, more than a few times, since on slow or
	 * overloaded buildfarm machines, it's been observed that the
	 * checkpointer can take several seconds to start.)  However, if not
	 * told to wait for the checkpoint to occur, we consider failure to set
	 * the latch to be nonfatal and merely LOG it.  The checkpointer should
	 * see the request when it does start, with or without the SetLatch().
	 */
#define MAX_SIGNAL_TRIES 600	/* max wait 60.0 sec */
	for (ntries = 0;; ntries++)
	{
		volatile PROC_HDR *procglobal = ProcGlobal;
		ProcNumber	checkpointerProc = procglobal->checkpointerProc;

		if (checkpointerProc == INVALID_PROC_NUMBER)
		{
			if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT))
			{
				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
					 "could not notify checkpoint: checkpointer is not running");
				break;
			}
		}
		else
		{
			SetLatch(&GetPGProcByNumber(checkpointerProc)->procLatch);
			/* notified successfully */
			break;
		}

		CHECK_FOR_INTERRUPTS();
		pg_usleep(100000L);		/* wait 0.1 sec, then retry */
	}
	/*
	 * If requested, wait for completion.  We detect completion according to
	 * the algorithm given above.
	 */
	if (flags & CHECKPOINT_WAIT)
	{
		int			new_started,
					new_failed;

		/* Wait for a new checkpoint to start. */
		ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv);
		for (;;)
		{
			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			new_started = CheckpointerShmem->ckpt_started;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			if (new_started != old_started)
				break;

			ConditionVariableSleep(&CheckpointerShmem->start_cv,
								   WAIT_EVENT_CHECKPOINT_START);
		}
		ConditionVariableCancelSleep();
		/*
		 * We are waiting for ckpt_done >= new_started, in a modulo sense.
		 */
		ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv);
		for (;;)
		{
			int			new_done;

			SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
			new_done = CheckpointerShmem->ckpt_done;
			new_failed = CheckpointerShmem->ckpt_failed;
			SpinLockRelease(&CheckpointerShmem->ckpt_lck);

			if (new_done - new_started >= 0)
				break;

			ConditionVariableSleep(&CheckpointerShmem->done_cv,
								   WAIT_EVENT_CHECKPOINT_DONE);
		}
		ConditionVariableCancelSleep();

		if (new_failed != old_failed)
			ereport(ERROR,
					(errmsg("checkpoint request failed"),
					 errhint("Consult recent messages in the server log for details.")));
	}
}
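A hedged usage sketch: requesting a fast checkpoint and blocking until it
completes, roughly what the CHECKPOINT SQL command issues (the wrapper
function is illustrative, and the command's exact flag choice can differ,
e.g. during recovery):

static void
ForceCheckpointSketch(void)
{
	RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT | CHECKPOINT_FORCE);
}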
/*
 * ForwardSyncRequest
 *		Forward a file-fsync request from a backend to the checkpointer
 *
 * Whenever a backend is compelled to write directly to a relation
 * (which should be seldom, if the background writer is getting its job done),
 * the backend calls this routine to pass over knowledge that the relation
 * is dirty and must be fsync'd before next checkpoint.  We also use this
 * opportunity to count such writes for statistical purposes.
 *
 * To avoid holding the lock for longer than necessary, we normally write
 * to the requests[] queue without checking for duplicates.  The checkpointer
 * will have to eliminate dups internally anyway.  However, if we discover
 * that the queue is full, we make a pass over the entire queue to compact
 * it.  This is somewhat expensive, but the alternative is for the backend
 * to perform its own fsync, which is far more expensive in practice.  It
 * is theoretically possible a backend fsync might still be necessary, if
 * the queue is full and contains no duplicate entries.  In that case, we
 * let the backend know by returning false.
 */
bool
ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
{
	CheckpointerRequest *request;
	bool		too_full;

	if (!IsUnderPostmaster)
		return false;			/* probably shouldn't even get here */

	if (AmCheckpointerProcess())
		elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");

	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);

	/*
	 * If the checkpointer isn't running or the request queue is full, the
	 * backend will have to perform its own fsync request.  But before
	 * forcing that to happen, we can try to compact the request queue.
	 */
	if (CheckpointerShmem->checkpointer_pid == 0 ||
		(CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests &&
		 !CompactCheckpointerRequestQueue()))
	{
		LWLockRelease(CheckpointerCommLock);
		return false;
	}

	/* OK, insert request */
	request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
	request->ftag = *ftag;
	request->type = type;

	/* If queue is more than half full, nudge the checkpointer to empty it */
	too_full = (CheckpointerShmem->num_requests >=
				CheckpointerShmem->max_requests / 2);

	LWLockRelease(CheckpointerCommLock);

	/* ... but not till after we release the lock */
	if (too_full)
	{
		volatile PROC_HDR *procglobal = ProcGlobal;
		ProcNumber	checkpointerProc = procglobal->checkpointerProc;

		if (checkpointerProc != INVALID_PROC_NUMBER)
			SetLatch(&GetPGProcByNumber(checkpointerProc)->procLatch);
	}

	return true;
}

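/*
 * Caller-side sketch (added commentary, not part of the original file):
 * the usual caller of ForwardSyncRequest() is RegisterSyncRequest() in
 * sync.c.  In simplified, hypothetical form, the fallback protocol the
 * comment above describes looks roughly like this, where retryOnError is
 * the caller's choice:
 *
 *		for (;;)
 *		{
 *			if (ForwardSyncRequest(ftag, type))
 *				break;
 *			if (!retryOnError)
 *				return false;
 *			pg_usleep(10000L);
 *		}
 *
 * On a false return with no retry, the requesting backend must perform the
 * fsync itself; the real caller also prefers waiting on a latch with a
 * short timeout over an unconditional sleep.
 */
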
/*
 * CompactCheckpointerRequestQueue
 *		Remove duplicates from the request queue to avoid backend fsyncs.
 *		Returns "true" if any entries were removed.
 *
 * Although a full fsync request queue is not common, it can lead to severe
 * performance problems when it does happen.  So far, this situation has
 * only been observed to occur when the system is under heavy write load,
 * and especially during the "sync" phase of a checkpoint.  Without this
 * logic, each backend begins doing an fsync for every block written, which
 * gets very expensive and can slow down the whole system.
 *
 * Trying to do this every time the queue is full could lose if there
 * aren't any removable entries.  But that should be vanishingly rare in
 * practice: there's one queue entry per shared buffer.
 */
static bool
CompactCheckpointerRequestQueue(void)
{
	struct CheckpointerSlotMapping
	{
		CheckpointerRequest request;
		int			slot;
	};

	int			n,
				preserve_count;
	int			num_skipped = 0;
	HASHCTL		ctl;
	HTAB	   *htab;
	bool	   *skip_slot;

	/* must hold CheckpointerCommLock in exclusive mode */
	Assert(LWLockHeldByMe(CheckpointerCommLock));

	/* Avoid memory allocations in a critical section. */
	if (CritSectionCount > 0)
		return false;

	/* Initialize skip_slot array */
	skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests);

	/* Initialize temporary hash table */
	ctl.keysize = sizeof(CheckpointerRequest);
	ctl.entrysize = sizeof(struct CheckpointerSlotMapping);
	ctl.hcxt = CurrentMemoryContext;

	htab = hash_create("CompactCheckpointerRequestQueue",
					   CheckpointerShmem->num_requests,
					   &ctl,
					   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	/*
	 * The basic idea here is that a request can be skipped if it's followed
	 * by a later, identical request.  It might seem more sensible to work
	 * backwards from the end of the queue and check whether a request is
	 * *preceded* by an earlier, identical request, in the hopes of doing less
	 * copying.  But that might change the semantics, if there's an
	 * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it
	 * this way.  It would be possible to be even smarter if we made the code
	 * below understand the specific semantics of such requests (it could blow
	 * away preceding entries that would end up being canceled anyhow), but
	 * it's not clear that the extra complexity would buy us anything.
	 */
	for (n = 0; n < CheckpointerShmem->num_requests; n++)
	{
		CheckpointerRequest *request;
		struct CheckpointerSlotMapping *slotmap;
		bool		found;

		/*
		 * We use the request struct directly as a hashtable key.  This
		 * assumes that any padding bytes in the structs are consistently the
		 * same, which should be okay because we zeroed them in
		 * CheckpointerShmemInit.  Note also that RelFileLocator had better
		 * contain no pad bytes.
		 */
		request = &CheckpointerShmem->requests[n];
		slotmap = hash_search(htab, request, HASH_ENTER, &found);
		if (found)
		{
			/* Duplicate, so mark the previous occurrence as skippable */
			skip_slot[slotmap->slot] = true;
			num_skipped++;
		}

		/* Remember slot containing latest occurrence of this request value */
		slotmap->slot = n;
	}

	/* Done with the hash table. */
	hash_destroy(htab);

	/* If no duplicates, we're out of luck. */
	if (!num_skipped)
	{
		pfree(skip_slot);
		return false;
	}

	/* We found some duplicates; remove them. */
	preserve_count = 0;
	for (n = 0; n < CheckpointerShmem->num_requests; n++)
	{
		if (skip_slot[n])
			continue;
		CheckpointerShmem->requests[preserve_count++] = CheckpointerShmem->requests[n];
	}
	ereport(DEBUG1,
			(errmsg_internal("compacted fsync request queue from %d entries to %d entries",
							 CheckpointerShmem->num_requests, preserve_count)));
	CheckpointerShmem->num_requests = preserve_count;

	/* Cleanup. */
	pfree(skip_slot);
	return true;
}

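/*
 * Worked example (added commentary, not part of the original file): given
 * a queue of requests [A, B, A, C, B], the first pass in
 * CompactCheckpointerRequestQueue() above marks the earlier A (slot 0) and
 * the earlier B (slot 1) as skippable, since each is followed by a later,
 * identical request.  The compaction loop then preserves the surviving
 * slots in order, yielding [A, C, B].  Keeping the *latest* occurrence of
 * each value, rather than the earliest, is what keeps this safe in the
 * presence of ordering-sensitive entries such as SYNC_FORGET_REQUEST, per
 * the comment in the function above.
 */
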
/*
 * AbsorbSyncRequests
 *		Retrieve queued sync requests and pass them to sync mechanism.
 *
 * This is exported because it must be called during CreateCheckPoint;
 * we have to be sure we have accepted all pending requests just before
 * we start fsync'ing.  Since CreateCheckPoint sometimes runs in
 * non-checkpointer processes, do nothing if not checkpointer.
 */
void
AbsorbSyncRequests(void)
{
	CheckpointerRequest *requests = NULL;
	CheckpointerRequest *request;
	int			n;

	if (!AmCheckpointerProcess())
		return;

	LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);

	/*
	 * We try to avoid holding the lock for a long time by copying the request
	 * array, and processing the requests after releasing the lock.
	 *
	 * Once we have cleared the requests from shared memory, we have to PANIC
	 * if we then fail to absorb them (eg, because our hashtable runs out of
	 * memory).  This is because the system cannot run safely if we are unable
	 * to fsync what we have been told to fsync.  Fortunately, the hashtable
	 * is so small that the problem is quite unlikely to arise in practice.
	 */
	n = CheckpointerShmem->num_requests;
	if (n > 0)
	{
		requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest));
		memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
	}

	START_CRIT_SECTION();

	CheckpointerShmem->num_requests = 0;

	LWLockRelease(CheckpointerCommLock);

	for (request = requests; n > 0; request++, n--)
		RememberSyncRequest(&request->ftag, request->type);

	END_CRIT_SECTION();

	if (requests)
		pfree(requests);
}

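/*
 * Design note (added commentary, not part of the original file): the
 * pattern above -- copy the queue while holding the lock, zero the shared
 * counter inside a critical section, then process the private copy --
 * keeps the lock hold time short.  The critical section matters because
 * START_CRIT_SECTION() causes any ERROR raised before END_CRIT_SECTION()
 * to be escalated to PANIC, so a failure in RememberSyncRequest() cannot
 * silently drop a sync request the checkpointer has already claimed from
 * shared memory.
 */
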
/*
 * Update any shared memory configurations based on config parameters
 */
static void
UpdateSharedMemoryConfig(void)
{
	/* update global shmem state for sync rep */
	SyncRepUpdateSyncStandbysDefined();

	/*
	 * If full_page_writes has been changed by SIGHUP, we update it in shared
	 * memory and write an XLOG_FPW_CHANGE record.
	 */
	UpdateFullPageWrites();

	elog(DEBUG2, "checkpointer updated shared memory configuration values");
}

/*
 * FirstCallSinceLastCheckpoint allows a process to take an action once
 * per checkpoint cycle by asynchronously checking for checkpoint completion.
 */
bool
FirstCallSinceLastCheckpoint(void)
{
	static int	ckpt_done = 0;
	int			new_done;
	bool		FirstCall = false;

	SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
	new_done = CheckpointerShmem->ckpt_done;
	SpinLockRelease(&CheckpointerShmem->ckpt_lck);

	if (new_done != ckpt_done)
		FirstCall = true;

	ckpt_done = new_done;

	return FirstCall;
}
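
/*
 * Usage sketch (added commentary, not part of the original file): a
 * hypothetical once-per-cycle caller, invoked repeatedly from some outer
 * loop, would look like:
 *
 *		if (FirstCallSinceLastCheckpoint())
 *			do_once_per_checkpoint_work();
 *
 * where do_once_per_checkpoint_work() is an illustrative placeholder.
 * Because ckpt_done is static, the "have I seen this checkpoint?" state is
 * private to each process: every process calling this function gets its
 * own single true result per completed checkpoint.
 */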