/*-------------------------------------------------------------------------
 *
 * spgvacuum.c
 *	  vacuum for SP-GiST
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *			src/backend/access/spgist/spgvacuum.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/spgist_private.h"
#include "access/spgxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/snapmgr.h"

/* Entry in pending-list of TIDs we need to revisit */
typedef struct spgVacPendingItem
{
	ItemPointerData tid;		/* redirection target to visit */
	bool		done;			/* have we dealt with this? */
	struct spgVacPendingItem *next; /* list link */
} spgVacPendingItem;

/* Local state for vacuum operations */
typedef struct spgBulkDeleteState
{
	/* Parameters passed in to spgvacuumscan */
	IndexVacuumInfo *info;
	IndexBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;

	/* Additional working state */
	SpGistState spgstate;		/* for SPGiST operations that need one */
	spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */
	TransactionId myXmin;		/* for detecting newly-added redirects */
	BlockNumber lastFilledBlock;	/* last non-deletable block */
} spgBulkDeleteState;


/*
 * Add TID to pendingList, but only if not already present.
 *
 * Note that new items are always appended at the end of the list; this
 * ensures that scans of the list don't miss items added during the scan.
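 *
 * The duplicate filtering below also ensures that chasing concurrently
 * created redirections cannot turn into an infinite loop (see the comments
 * above vacuumLeafPage).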
 */
static void
spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
{
	spgVacPendingItem *pitem;
	spgVacPendingItem **listLink;

	/* search the list for pre-existing entry */
	listLink = &bds->pendingList;
	while (*listLink != NULL)
	{
		pitem = *listLink;
		if (ItemPointerEquals(tid, &pitem->tid))
			return;				/* already in list, do nothing */
		listLink = &pitem->next;
	}
	/* not there, so append new entry */
	pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
	pitem->tid = *tid;
	pitem->done = false;
	pitem->next = NULL;
	*listLink = pitem;
}

/*
 * Clear pendingList
 */
static void
spgClearPendingList(spgBulkDeleteState *bds)
{
	spgVacPendingItem *pitem;
	spgVacPendingItem *nitem;

	for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
	{
		nitem = pitem->next;
		/* All items in list should have been dealt with */
		Assert(pitem->done);
		pfree(pitem);
	}
	bds->pendingList = NULL;
}

/*
 * Vacuum a regular (non-root) leaf page
 *
 * We must delete tuples that are targeted for deletion by the VACUUM,
 * but not move any tuples that are referenced by outside links; we assume
 * those are the ones that are heads of chains.
 *
 * If we find a REDIRECT that was made by a concurrently-running transaction,
 * we must add its target TID to pendingList.  (We don't try to visit the
 * target immediately, first because we don't want VACUUM locking more than
 * one buffer at a time, and second because the duplicate-filtering logic
 * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
 * loop in the face of continuous concurrent insertions.)
 *
 * If forPending is true, we are examining the page as a consequence of
 * chasing a redirect link, not as part of the normal sequential scan.
 * We still vacuum the page normally, but we don't increment the stats
 * about live tuples; else we'd double-count those tuples, since the page
 * has been or will be visited in the sequential scan as well.
 */
static void
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
			   bool forPending)
{
	Page		page = BufferGetPage(buffer);
	spgxlogVacuumLeaf xlrec;
	OffsetNumber toDead[MaxIndexTuplesPerPage];
	OffsetNumber toPlaceholder[MaxIndexTuplesPerPage];
	OffsetNumber moveSrc[MaxIndexTuplesPerPage];
	OffsetNumber moveDest[MaxIndexTuplesPerPage];
	OffsetNumber chainSrc[MaxIndexTuplesPerPage];
	OffsetNumber chainDest[MaxIndexTuplesPerPage];
	OffsetNumber predecessor[MaxIndexTuplesPerPage + 1];
	bool		deletable[MaxIndexTuplesPerPage + 1];
	int			nDeletable;
	OffsetNumber i,
				max = PageGetMaxOffsetNumber(page);
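
	/*
	 * predecessor[x] will record the offset of the live tuple whose chain
	 * link points at offset x; entries left zero mark chain heads.
	 * deletable[] flags the tuples the bulk-delete callback says to remove.
	 */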
	memset(predecessor, 0, sizeof(predecessor));
	memset(deletable, 0, sizeof(deletable));
	nDeletable = 0;

	/* Scan page, identify tuples to delete, accumulate stats */
	for (i = FirstOffsetNumber; i <= max; i++)
	{
		SpGistLeafTuple lt;

		lt = (SpGistLeafTuple) PageGetItem(page,
										   PageGetItemId(page, i));
		if (lt->tupstate == SPGIST_LIVE)
		{
			Assert(ItemPointerIsValid(&lt->heapPtr));

			if (bds->callback(&lt->heapPtr, bds->callback_state))
			{
				bds->stats->tuples_removed += 1;
				deletable[i] = true;
				nDeletable++;
			}
			else
			{
				if (!forPending)
					bds->stats->num_index_tuples += 1;
			}

			/* Form predecessor map, too */
			if (SGLT_GET_NEXTOFFSET(lt) != InvalidOffsetNumber)
			{
				/* paranoia about corrupted chain links */
				if (SGLT_GET_NEXTOFFSET(lt) < FirstOffsetNumber ||
					SGLT_GET_NEXTOFFSET(lt) > max ||
					predecessor[SGLT_GET_NEXTOFFSET(lt)] != InvalidOffsetNumber)
					elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"",
						 BufferGetBlockNumber(buffer),
						 RelationGetRelationName(index));
				predecessor[SGLT_GET_NEXTOFFSET(lt)] = i;
			}
		}
		else if (lt->tupstate == SPGIST_REDIRECT)
		{
			SpGistDeadTuple dt = (SpGistDeadTuple) lt;

			Assert(SGLT_GET_NEXTOFFSET(dt) == InvalidOffsetNumber);
			Assert(ItemPointerIsValid(&dt->pointer));

			/*
			 * Add target TID to pending list if the redirection could have
			 * happened since VACUUM started.  (If xid is invalid, assume it
			 * must have happened before VACUUM started, since REINDEX
			 * CONCURRENTLY locks out VACUUM.)
			 *
			 * Note: we could make a tighter test by seeing if the xid is
			 * "running" according to the active snapshot; but snapmgr.c
			 * doesn't currently export a suitable API, and it's not entirely
			 * clear that a tighter test is worth the cycles anyway.
			 */
			if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
				spgAddPendingTID(bds, &dt->pointer);
		}
		else
		{
			Assert(SGLT_GET_NEXTOFFSET(lt) == InvalidOffsetNumber);
		}
	}

	if (nDeletable == 0)
		return;					/* nothing more to do */

	/*----------
	 * Figure out exactly what we have to do.  We do this separately from
	 * actually modifying the page, mainly so that we have a representation
	 * that can be dumped into WAL and then the replay code can do exactly
	 * the same thing.  The output of this step consists of six arrays
	 * describing four kinds of operations, to be performed in this order:
	 *
	 * toDead[]: tuple numbers to be replaced with DEAD tuples
	 * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples
	 * moveSrc[]: tuple numbers that need to be relocated to another offset
	 * (replacing the tuple there) and then replaced with PLACEHOLDER tuples
	 * moveDest[]: new locations for moveSrc tuples
	 * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates
	 * chainDest[]: new values of nextOffset for chainSrc members
	 *
	 * It's easiest to figure out what we have to do by processing tuple
	 * chains, so we iterate over all the tuples (not just the deletable
	 * ones!) to identify chain heads, then chase down each chain and make
	 * work item entries for deletable tuples within the chain.
	 *----------
	 */
	xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0;

	for (i = FirstOffsetNumber; i <= max; i++)
	{
		SpGistLeafTuple head;
		bool		interveningDeletable;
		OffsetNumber prevLive;
		OffsetNumber j;

		head = (SpGistLeafTuple) PageGetItem(page,
											 PageGetItemId(page, i));
		if (head->tupstate != SPGIST_LIVE)
			continue;			/* can't be a chain member */
		if (predecessor[i] != 0)
			continue;			/* not a chain head */

		/* initialize ... */
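		/*
		 * prevLive tracks the offset of the most recent chain member that
		 * will remain live (InvalidOffsetNumber if none yet), while
		 * interveningDeletable records whether any deletable tuples lie
		 * between it and the current position, i.e. whether prevLive's
		 * chain link will need updating.
		 */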
		interveningDeletable = false;
		prevLive = deletable[i] ? InvalidOffsetNumber : i;

		/* scan down the chain ... */
		j = SGLT_GET_NEXTOFFSET(head);
		while (j != InvalidOffsetNumber)
		{
			SpGistLeafTuple lt;

			lt = (SpGistLeafTuple) PageGetItem(page,
											   PageGetItemId(page, j));
			if (lt->tupstate != SPGIST_LIVE)
			{
				/* all tuples in chain should be live */
				elog(ERROR, "unexpected SPGiST tuple state: %d",
					 lt->tupstate);
			}

			if (deletable[j])
			{
				/* This tuple should be replaced by a placeholder */
				toPlaceholder[xlrec.nPlaceholder] = j;
				xlrec.nPlaceholder++;
				/* previous live tuple's chain link will need an update */
				interveningDeletable = true;
			}
			else if (prevLive == InvalidOffsetNumber)
			{
				/*
				 * This is the first live tuple in the chain.  It has to move
				 * to the head position.
				 */
				moveSrc[xlrec.nMove] = j;
				moveDest[xlrec.nMove] = i;
				xlrec.nMove++;
				/* Chain updates will be applied after the move */
				prevLive = i;
				interveningDeletable = false;
			}
			else
			{
				/*
				 * Second or later live tuple.  Arrange to re-chain it to the
				 * previous live one, if there was a gap.
				 */
				if (interveningDeletable)
				{
					chainSrc[xlrec.nChain] = prevLive;
					chainDest[xlrec.nChain] = j;
					xlrec.nChain++;
				}
				prevLive = j;
				interveningDeletable = false;
			}

			j = SGLT_GET_NEXTOFFSET(lt);
		}

		if (prevLive == InvalidOffsetNumber)
		{
			/* The chain is entirely removable, so we need a DEAD tuple */
			toDead[xlrec.nDead] = i;
			xlrec.nDead++;
		}
		else if (interveningDeletable)
		{
			/* One or more deletions at end of chain, so close it off */
			chainSrc[xlrec.nChain] = prevLive;
			chainDest[xlrec.nChain] = InvalidOffsetNumber;
			xlrec.nChain++;
		}
	}

	/* sanity check ... */
	if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove)
		elog(ERROR, "inconsistent counts of deletable tuples");

	/* Do the updates */
	START_CRIT_SECTION();

	spgPageIndexMultiDelete(&bds->spgstate, page,
							toDead, xlrec.nDead,
							SPGIST_DEAD, SPGIST_DEAD,
							InvalidBlockNumber, InvalidOffsetNumber);

	spgPageIndexMultiDelete(&bds->spgstate, page,
							toPlaceholder, xlrec.nPlaceholder,
							SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
							InvalidBlockNumber, InvalidOffsetNumber);

	/*
	 * We implement the move step by swapping the line pointers of the source
	 * and target tuples, then replacing the newly-source tuples with
	 * placeholders.  This is perhaps unduly friendly with the page data
	 * representation, but it's fast and doesn't risk page overflow when a
	 * tuple to be relocated is large.
	 */
	for (i = 0; i < xlrec.nMove; i++)
	{
		ItemId		idSrc = PageGetItemId(page, moveSrc[i]);
		ItemId		idDest = PageGetItemId(page, moveDest[i]);
		ItemIdData	tmp;

		tmp = *idSrc;
		*idSrc = *idDest;
		*idDest = tmp;
	}

	spgPageIndexMultiDelete(&bds->spgstate, page,
							moveSrc, xlrec.nMove,
							SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
							InvalidBlockNumber, InvalidOffsetNumber);

	for (i = 0; i < xlrec.nChain; i++)
	{
		SpGistLeafTuple lt;

		lt = (SpGistLeafTuple) PageGetItem(page,
										   PageGetItemId(page, chainSrc[i]));
		Assert(lt->tupstate == SPGIST_LIVE);
		SGLT_SET_NEXTOFFSET(lt, chainDest[i]);
	}

	MarkBufferDirty(buffer);

	if (RelationNeedsWAL(index))
	{
		XLogRecPtr	recptr;

		XLogBeginInsert();

		STORE_STATE(&bds->spgstate, xlrec.stateSrc);

		XLogRegisterData(&xlrec, SizeOfSpgxlogVacuumLeaf);
		/* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
		XLogRegisterData(toDead, sizeof(OffsetNumber) * xlrec.nDead);
		XLogRegisterData(toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder);
		XLogRegisterData(moveSrc, sizeof(OffsetNumber) * xlrec.nMove);
		XLogRegisterData(moveDest, sizeof(OffsetNumber) * xlrec.nMove);
		XLogRegisterData(chainSrc, sizeof(OffsetNumber) * xlrec.nChain);
		XLogRegisterData(chainDest, sizeof(OffsetNumber) * xlrec.nChain);

		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

		recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF);

		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();
}

/*
 * Vacuum a root page when it is also a leaf
 *
 * On the root, we just delete any dead leaf tuples; no fancy business
 */
static void
vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	spgxlogVacuumRoot xlrec;
	OffsetNumber toDelete[MaxIndexTuplesPerPage];
	OffsetNumber i,
				max = PageGetMaxOffsetNumber(page);

	xlrec.nDelete = 0;

	/* Scan page, identify tuples to delete, accumulate stats */
	for (i = FirstOffsetNumber; i <= max; i++)
	{
		SpGistLeafTuple lt;

		lt = (SpGistLeafTuple) PageGetItem(page,
										   PageGetItemId(page, i));
		if (lt->tupstate == SPGIST_LIVE)
		{
			Assert(ItemPointerIsValid(&lt->heapPtr));

			if (bds->callback(&lt->heapPtr, bds->callback_state))
			{
				bds->stats->tuples_removed += 1;
				toDelete[xlrec.nDelete] = i;
				xlrec.nDelete++;
			}
			else
			{
				bds->stats->num_index_tuples += 1;
			}
		}
		else
		{
			/* all tuples on root should be live */
			elog(ERROR, "unexpected SPGiST tuple state: %d",
				 lt->tupstate);
		}
	}

	if (xlrec.nDelete == 0)
		return;					/* nothing more to do */

	/* Do the update */
	START_CRIT_SECTION();

	/* The tuple numbers are in order, so we can use PageIndexMultiDelete */
	PageIndexMultiDelete(page, toDelete, xlrec.nDelete);

	MarkBufferDirty(buffer);

	if (RelationNeedsWAL(index))
	{
		XLogRecPtr	recptr;

		XLogBeginInsert();

		/* Prepare WAL record */
		STORE_STATE(&bds->spgstate, xlrec.stateSrc);

		XLogRegisterData(&xlrec, SizeOfSpgxlogVacuumRoot);
		/* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
		XLogRegisterData(toDelete,
						 sizeof(OffsetNumber) * xlrec.nDelete);

		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

		recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_ROOT);

		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();
}

/*
 * Clean up redirect and placeholder tuples on the given page
 *
 * Redirect tuples can be marked placeholder once they're old enough.
 * Placeholder tuples can be removed if it won't change the offsets of
 * non-placeholder ones.
 *
 * Unlike the routines above, this works on both leaf and inner pages.
 */
static void
vacuumRedirectAndPlaceholder(Relation index, Relation heaprel, Buffer buffer)
{
	Page		page = BufferGetPage(buffer);
	SpGistPageOpaque opaque = SpGistPageGetOpaque(page);
	OffsetNumber i,
				max = PageGetMaxOffsetNumber(page),
				firstPlaceholder = InvalidOffsetNumber;
	bool		hasNonPlaceholder = false;
	bool		hasUpdate = false;
	OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage];
	OffsetNumber itemnos[MaxIndexTuplesPerPage];
	spgxlogVacuumRedirect xlrec;
	GlobalVisState *vistest;

	xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
	xlrec.nToPlaceholder = 0;
	xlrec.snapshotConflictHorizon = InvalidTransactionId;

	vistest = GlobalVisTestFor(heaprel);

	START_CRIT_SECTION();

	/*
	 * Scan backwards to convert old redirection tuples to placeholder tuples,
	 * and identify location of last non-placeholder tuple while at it.
	 */
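	/*
	 * The loop below can end early: once the page's redirection count drops
	 * to zero and the last non-placeholder tuple has been found, nothing
	 * further on the page needs to change.
	 */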
	for (i = max;
		 i >= FirstOffsetNumber &&
		 (opaque->nRedirection > 0 || !hasNonPlaceholder);
		 i--)
	{
		SpGistDeadTuple dt;

		dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i));

		/*
		 * We can convert a REDIRECT to a PLACEHOLDER if there could no longer
		 * be any index scans "in flight" to it.  Such an index scan would
		 * have to be in a transaction whose snapshot sees the REDIRECT's XID
		 * as still running, so comparing the XID against global xmin is a
		 * conservatively safe test.  If the XID is invalid, it must have been
		 * inserted by REINDEX CONCURRENTLY, so we can zap it immediately.
		 */
		if (dt->tupstate == SPGIST_REDIRECT &&
			(!TransactionIdIsValid(dt->xid) ||
			 GlobalVisTestIsRemovableXid(vistest, dt->xid)))
		{
			dt->tupstate = SPGIST_PLACEHOLDER;
			Assert(opaque->nRedirection > 0);
			opaque->nRedirection--;
			opaque->nPlaceholder++;

			/* remember newest XID among the removed redirects */
			if (!TransactionIdIsValid(xlrec.snapshotConflictHorizon) ||
				TransactionIdPrecedes(xlrec.snapshotConflictHorizon, dt->xid))
				xlrec.snapshotConflictHorizon = dt->xid;

			ItemPointerSetInvalid(&dt->pointer);

			itemToPlaceholder[xlrec.nToPlaceholder] = i;
			xlrec.nToPlaceholder++;

			hasUpdate = true;
		}

		if (dt->tupstate == SPGIST_PLACEHOLDER)
		{
			if (!hasNonPlaceholder)
				firstPlaceholder = i;
		}
		else
		{
			hasNonPlaceholder = true;
		}
	}

	/*
	 * Any placeholder tuples at the end of page can safely be removed.  We
	 * can't remove ones before the last non-placeholder, though, because we
	 * can't alter the offset numbers of non-placeholder tuples.
	 */
	if (firstPlaceholder != InvalidOffsetNumber)
	{
		/*
		 * We do not store this array to rdata because it's easy to recreate.
		 */
		for (i = firstPlaceholder; i <= max; i++)
			itemnos[i - firstPlaceholder] = i;

		i = max - firstPlaceholder + 1;
		Assert(opaque->nPlaceholder >= i);
		opaque->nPlaceholder -= i;

		/* The array is surely sorted, so can use PageIndexMultiDelete */
		PageIndexMultiDelete(page, itemnos, i);

		hasUpdate = true;
	}

	xlrec.firstPlaceholder = firstPlaceholder;

	if (hasUpdate)
		MarkBufferDirty(buffer);

	if (hasUpdate && RelationNeedsWAL(index))
	{
		XLogRecPtr	recptr;

		XLogBeginInsert();

		XLogRegisterData(&xlrec, SizeOfSpgxlogVacuumRedirect);
		XLogRegisterData(itemToPlaceholder,
						 sizeof(OffsetNumber) * xlrec.nToPlaceholder);

		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

		recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_REDIRECT);

		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();
}

/*
 * Process one page during a bulkdelete scan
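 *
 * The buffer is read with the caller's access strategy and kept exclusively
 * locked while the page is examined; non-root empty and all-zero pages are
 * reported to the FSM near the bottom of this function.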
 */
static void
spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
{
	Relation	index = bds->info->index;
	Buffer		buffer;
	Page		page;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point(false);

	buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
								RBM_NORMAL, bds->info->strategy);
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
	page = (Page) BufferGetPage(buffer);

	if (PageIsNew(page))
	{
		/*
		 * We found an all-zero page, which could happen if the database
		 * crashed just after extending the file.  Recycle it.
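		 *
		 * The recycling itself happens further down, where non-root empty or
		 * all-zero pages are handed to RecordFreeIndexPage.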
		 */
	}
	else if (PageIsEmpty(page))
	{
		/* nothing to do */
	}
	else if (SpGistPageIsLeaf(page))
	{
		if (SpGistBlockIsRoot(blkno))
		{
			vacuumLeafRoot(bds, index, buffer);
			/* no need for vacuumRedirectAndPlaceholder */
		}
		else
		{
			vacuumLeafPage(bds, index, buffer, false);
			vacuumRedirectAndPlaceholder(index, bds->info->heaprel, buffer);
		}
	}
	else
	{
		/* inner page */
		vacuumRedirectAndPlaceholder(index, bds->info->heaprel, buffer);
	}

	/*
	 * The root pages must never be deleted, nor marked as available in FSM,
	 * because we don't want them ever returned by a search for a place to put
	 * a new tuple.  Otherwise, check for empty page, and make sure the FSM
	 * knows about it.
	 */
	if (!SpGistBlockIsRoot(blkno))
	{
		if (PageIsNew(page) || PageIsEmpty(page))
		{
			RecordFreeIndexPage(index, blkno);
			bds->stats->pages_deleted++;
		}
		else
		{
			SpGistSetLastUsedPage(index, buffer);
			bds->lastFilledBlock = blkno;
		}
	}

	UnlockReleaseBuffer(buffer);
}

/*
 * Process the pending-TID list between pages of the main scan
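 *
 * For each pending TID we examine the page it points to: a leaf page is
 * vacuumed on the spot, while for an inner tuple we push its downlinks (or
 * onward redirect pointer) onto the pending list to be visited in turn.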
 */
static void
spgprocesspending(spgBulkDeleteState *bds)
{
	Relation	index = bds->info->index;
	spgVacPendingItem *pitem;
	spgVacPendingItem *nitem;
	BlockNumber blkno;
	Buffer		buffer;
	Page		page;

	for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next)
	{
		if (pitem->done)
			continue;			/* ignore already-done items */

		/* call vacuum_delay_point while not holding any buffer lock */
		vacuum_delay_point(false);

		/* examine the referenced page */
		blkno = ItemPointerGetBlockNumber(&pitem->tid);
		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
									RBM_NORMAL, bds->info->strategy);
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		page = (Page) BufferGetPage(buffer);

		if (PageIsNew(page) || SpGistPageIsDeleted(page))
		{
			/* Probably shouldn't happen, but ignore it */
		}
		else if (SpGistPageIsLeaf(page))
		{
			if (SpGistBlockIsRoot(blkno))
			{
				/* this should definitely not happen */
				elog(ERROR, "redirection leads to root page of index \"%s\"",
					 RelationGetRelationName(index));
			}

			/* deal with any deletable tuples */
			vacuumLeafPage(bds, index, buffer, true);
			/* might as well do this while we are here */
			vacuumRedirectAndPlaceholder(index, bds->info->heaprel, buffer);

			SpGistSetLastUsedPage(index, buffer);

			/*
			 * We can mark as done not only this item, but any later ones
			 * pointing at the same page, since we vacuumed the whole page.
			 */
			pitem->done = true;
			for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
			{
				if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
					nitem->done = true;
			}
		}
		else
		{
			/*
			 * On an inner page, visit the referenced inner tuple and add all
			 * its downlinks to the pending list.  We might have pending items
			 * for more than one inner tuple on the same page (in fact this is
			 * pretty likely given the way space allocation works), so get
			 * them all while we are here.
			 */
			for (nitem = pitem; nitem != NULL; nitem = nitem->next)
			{
				if (nitem->done)
					continue;
				if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
				{
					OffsetNumber offset;
					SpGistInnerTuple innerTuple;

					offset = ItemPointerGetOffsetNumber(&nitem->tid);
					innerTuple = (SpGistInnerTuple) PageGetItem(page,
																PageGetItemId(page, offset));
					if (innerTuple->tupstate == SPGIST_LIVE)
					{
						SpGistNodeTuple node;
						int			i;

						SGITITERATE(innerTuple, i, node)
						{
							if (ItemPointerIsValid(&node->t_tid))
								spgAddPendingTID(bds, &node->t_tid);
						}
					}
					else if (innerTuple->tupstate == SPGIST_REDIRECT)
					{
						/* transfer attention to redirect point */
						spgAddPendingTID(bds,
										 &((SpGistDeadTuple) innerTuple)->pointer);
					}
					else
						elog(ERROR, "unexpected SPGiST tuple state: %d",
							 innerTuple->tupstate);

					nitem->done = true;
				}
			}
		}

		UnlockReleaseBuffer(buffer);
	}

	spgClearPendingList(bds);
}

/*
 * Perform a bulkdelete scan
 */
static void
spgvacuumscan(spgBulkDeleteState *bds)
{
	Relation	index = bds->info->index;
	bool		needLock;
	BlockNumber num_pages,
				blkno;

	/* Finish setting up spgBulkDeleteState */
	initSpGistState(&bds->spgstate, index);
	bds->pendingList = NULL;
	bds->myXmin = GetActiveSnapshot()->xmin;
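	/*
	 * lastFilledBlock remembers the last block seen to contain data; it
	 * feeds the (currently disabled) truncation logic below.
	 */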
	bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	bds->stats->estimated_count = false;
	bds->stats->num_index_tuples = 0;
	bds->stats->pages_deleted = 0;

	/* We can skip locking for new or temp relations */
	needLock = !RELATION_IS_LOCAL(index);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  See more extensive comments about this
	 * in btvacuumscan().
	 */
	blkno = SPGIST_METAPAGE_BLKNO + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(index, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(index);
		if (needLock)
			UnlockRelationForExtension(index, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			spgvacuumpage(bds, blkno);
			/* empty the pending-list after each page */
			if (bds->pendingList != NULL)
				spgprocesspending(bds);
		}
	}

	/* Propagate local lastUsedPages cache to metablock */
	SpGistUpdateMetaPage(index);

	/*
	 * If we found any empty pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no empty pages exist, we don't bother vacuuming the FSM at
	 * all.
	 */
	if (bds->stats->pages_deleted > 0)
		IndexFreeSpaceMapVacuum(index);

	/*
	 * Truncate index if possible
	 *
	 * XXX disabled because it's unsafe due to possible concurrent inserts.
	 * We'd have to rescan the pages to make sure they're still empty, and it
	 * doesn't seem worth it.  Note that btree doesn't do this either.
	 *
	 * Another reason not to truncate is that it could invalidate the cached
	 * pages-with-freespace pointers in the metapage and other backends'
	 * relation caches, that is leave them pointing to nonexistent pages.
	 * Adding RelationGetNumberOfBlocks calls to protect the places that use
	 * those pointers would be unduly expensive.
	 */
#ifdef NOT_USED
	if (num_pages > bds->lastFilledBlock + 1)
	{
		BlockNumber lastBlock = num_pages - 1;

		num_pages = bds->lastFilledBlock + 1;
		RelationTruncate(index, num_pages);
		bds->stats->pages_removed += lastBlock - bds->lastFilledBlock;
		bds->stats->pages_deleted -= lastBlock - bds->lastFilledBlock;
	}
#endif

	/* Report final stats */
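	/*
	 * Every page counted as "deleted" above was an empty page handed to the
	 * FSM, so each one is both newly deleted and immediately free.
	 */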
	bds->stats->num_pages = num_pages;
	bds->stats->pages_newly_deleted = bds->stats->pages_deleted;
	bds->stats->pages_free = bds->stats->pages_deleted;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
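 *
 * This is the ambulkdelete support function in SP-GiST's IndexAmRoutine.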
 */
IndexBulkDeleteResult *
spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			  IndexBulkDeleteCallback callback, void *callback_state)
{
	spgBulkDeleteState bds;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	bds.info = info;
	bds.stats = stats;
	bds.callback = callback;
	bds.callback_state = callback_state;

	spgvacuumscan(&bds);

	return stats;
}

/* Dummy callback to delete no tuples during spgvacuumcleanup */
static bool
dummy_callback(ItemPointer itemptr, void *state)
{
	return false;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
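 *
 * This is the amvacuumcleanup support function in SP-GiST's IndexAmRoutine.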
 */
IndexBulkDeleteResult *
spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	spgBulkDeleteState bds;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * We don't need to scan the index if there was a preceding bulkdelete
	 * pass.  Otherwise, make a pass that won't delete any live tuples, but
	 * might still accomplish useful stuff with redirect/placeholder cleanup
	 * and/or FSM housekeeping, and in any case will provide stats.
	 */
	if (stats == NULL)
	{
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		bds.info = info;
		bds.stats = stats;
		bds.callback = dummy_callback;
		bds.callback_state = NULL;

		spgvacuumscan(&bds);
	}

	/*
	 * It's quite possible for us to be fooled by concurrent tuple moves into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}