|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* gistutil.c
|
|
|
|
* utility routines for the postgres GiST index access method.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
|
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
|
|
|
* IDENTIFICATION
|
|
|
|
* src/backend/access/gist/gistutil.c
|
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include <math.h>
|
|
|
|
|
|
|
|
#include "access/gist_private.h"
|
|
|
|
#include "access/htup_details.h"
|
|
|
|
#include "access/reloptions.h"
|
|
|
|
#include "catalog/pg_opclass.h"
|
|
|
|
#include "storage/indexfsm.h"
|
|
|
|
#include "storage/lmgr.h"
|
|
|
|
#include "utils/float.h"
|
|
|
|
#include "utils/syscache.h"
|
|
|
|
#include "utils/lsyscache.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write itup vector to page, has no control of free space.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
|
|
|
|
{
|
|
|
|
OffsetNumber l = InvalidOffsetNumber;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (off == InvalidOffsetNumber)
|
|
|
|
off = (PageIsEmpty(page)) ? FirstOffsetNumber :
|
|
|
|
OffsetNumberNext(PageGetMaxOffsetNumber(page));
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
{
|
|
|
|
Size sz = IndexTupleSize(itup[i]);
|
|
|
|
|
|
|
|
l = PageAddItem(page, (Item) itup[i], sz, off, false, false);
|
|
|
|
if (l == InvalidOffsetNumber)
|
|
|
|
elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes",
|
|
|
|
i, len, (int) sz);
|
|
|
|
off++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check space for itup vector on page
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace)
|
|
|
|
{
|
|
|
|
unsigned int size = freespace,
|
|
|
|
deleted = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
|
|
|
|
|
|
|
|
if (todelete != InvalidOffsetNumber)
|
|
|
|
{
|
|
|
|
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete));
|
|
|
|
|
|
|
|
deleted = IndexTupleSize(itup) + sizeof(ItemIdData);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (PageGetFreeSpace(page) + deleted < size);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
gistfitpage(IndexTuple *itvec, int len)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
Size size = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
|
|
|
|
|
|
|
|
/* TODO: Consider fillfactor */
|
|
|
|
return (size <= GiSTPageSize);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read buffer into itup vector
|
|
|
|
*/
|
|
|
|
IndexTuple *
|
|
|
|
gistextractpage(Page page, int *len /* out */ )
|
|
|
|
{
|
|
|
|
OffsetNumber i,
|
|
|
|
maxoff;
|
|
|
|
IndexTuple *itvec;
|
|
|
|
|
|
|
|
maxoff = PageGetMaxOffsetNumber(page);
|
|
|
|
*len = maxoff;
|
|
|
|
itvec = palloc(sizeof(IndexTuple) * maxoff);
|
|
|
|
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
|
|
|
|
itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
|
|
|
|
|
|
|
|
return itvec;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* join two vectors into one
|
|
|
|
*/
|
|
|
|
IndexTuple *
|
|
|
|
gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
|
|
|
|
{
|
|
|
|
itvec = (IndexTuple *) repalloc((void *) itvec, sizeof(IndexTuple) * ((*len) + addlen));
|
|
|
|
memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen);
|
|
|
|
*len += addlen;
|
|
|
|
return itvec;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* make plain IndexTupleVector
|
|
|
|
*/
|
|
|
|
|
|
|
|
IndexTupleData *
|
|
|
|
gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
|
|
|
|
{
|
|
|
|
char *ptr,
|
|
|
|
*ret;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
*memlen = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < veclen; i++)
|
|
|
|
*memlen += IndexTupleSize(vec[i]);
|
|
|
|
|
|
|
|
ptr = ret = palloc(*memlen);
|
|
|
|
|
|
|
|
for (i = 0; i < veclen; i++)
|
|
|
|
{
|
|
|
|
memcpy(ptr, vec[i], IndexTupleSize(vec[i]));
|
|
|
|
ptr += IndexTupleSize(vec[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
return (IndexTupleData *) ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
* Make unions of keys in IndexTuple vector (one union datum per index column).
|
|
|
|
* Union Datums are returned into the attr/isnull arrays.
|
|
|
|
* Resulting Datums aren't compressed.
|
|
|
|
*/
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
15 years ago
|
|
|
void
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
|
|
|
|
Datum *attr, bool *isnull)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
GistEntryVector *evec;
|
|
|
|
int attrsize;
|
|
|
|
|
|
|
|
evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ);
|
|
|
|
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
for (i = 0; i < giststate->tupdesc->natts; i++)
|
|
|
|
{
|
|
|
|
int j;
|
|
|
|
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
/* Collect non-null datums for this column */
|
|
|
|
evec->n = 0;
|
|
|
|
for (j = 0; j < len; j++)
|
|
|
|
{
|
|
|
|
Datum datum;
|
|
|
|
bool IsNull;
|
|
|
|
|
|
|
|
datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull);
|
|
|
|
if (IsNull)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
gistdentryinit(giststate, i,
|
|
|
|
evec->vector + evec->n,
|
|
|
|
datum,
|
|
|
|
NULL, NULL, (OffsetNumber) 0,
|
|
|
|
false, IsNull);
|
|
|
|
evec->n++;
|
|
|
|
}
|
|
|
|
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
/* If this column was all NULLs, the union is NULL */
|
|
|
|
if (evec->n == 0)
|
|
|
|
{
|
|
|
|
attr[i] = (Datum) 0;
|
|
|
|
isnull[i] = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (evec->n == 1)
|
|
|
|
{
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
/* unionFn may expect at least two inputs */
|
|
|
|
evec->n = 2;
|
|
|
|
evec->vector[1] = evec->vector[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Make union and store in attr array */
|
|
|
|
attr[i] = FunctionCall2Coll(&giststate->unionFn[i],
|
|
|
|
giststate->supportCollation[i],
|
|
|
|
PointerGetDatum(evec),
|
|
|
|
PointerGetDatum(&attrsize));
|
|
|
|
|
|
|
|
isnull[i] = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return an IndexTuple containing the result of applying the "union"
|
|
|
|
* method to the specified IndexTuple vector.
|
|
|
|
*/
|
|
|
|
IndexTuple
|
|
|
|
gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
|
|
|
|
{
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
Datum attr[INDEX_MAX_KEYS];
|
|
|
|
bool isnull[INDEX_MAX_KEYS];
|
|
|
|
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
gistMakeUnionItVec(giststate, itvec, len, attr, isnull);
|
|
|
|
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
return gistFormTuple(giststate, r, attr, isnull, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* makes union of two key
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
gistMakeUnionKey(GISTSTATE *giststate, int attno,
|
|
|
|
GISTENTRY *entry1, bool isnull1,
|
|
|
|
GISTENTRY *entry2, bool isnull2,
|
|
|
|
Datum *dst, bool *dstisnull)
|
|
|
|
{
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
/* we need a GistEntryVector with room for exactly 2 elements */
|
|
|
|
union
|
|
|
|
{
|
|
|
|
GistEntryVector gev;
|
|
|
|
char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ];
|
|
|
|
} storage;
|
|
|
|
GistEntryVector *evec = &storage.gev;
|
|
|
|
int dstsize;
|
|
|
|
|
|
|
|
evec->n = 2;
|
|
|
|
|
|
|
|
if (isnull1 && isnull2)
|
|
|
|
{
|
|
|
|
*dstisnull = true;
|
|
|
|
*dst = (Datum) 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (isnull1 == false && isnull2 == false)
|
|
|
|
{
|
|
|
|
evec->vector[0] = *entry1;
|
|
|
|
evec->vector[1] = *entry2;
|
|
|
|
}
|
|
|
|
else if (isnull1 == false)
|
|
|
|
{
|
|
|
|
evec->vector[0] = *entry1;
|
|
|
|
evec->vector[1] = *entry1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
evec->vector[0] = *entry2;
|
|
|
|
evec->vector[1] = *entry2;
|
|
|
|
}
|
|
|
|
|
|
|
|
*dstisnull = false;
|
|
|
|
*dst = FunctionCall2Coll(&giststate->unionFn[attno],
|
|
|
|
giststate->supportCollation[attno],
|
|
|
|
PointerGetDatum(evec),
|
|
|
|
PointerGetDatum(&dstsize));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b)
|
|
|
|
{
|
|
|
|
bool result;
|
|
|
|
|
|
|
|
FunctionCall3Coll(&giststate->equalFn[attno],
|
|
|
|
giststate->supportCollation[attno],
|
|
|
|
a, b,
|
|
|
|
PointerGetDatum(&result));
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Decompress all keys in tuple
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
|
|
|
|
OffsetNumber o, GISTENTRY *attdata, bool *isnull)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < r->rd_att->natts; i++)
|
|
|
|
{
|
|
|
|
Datum datum;
|
|
|
|
|
|
|
|
datum = index_getattr(tuple, i + 1, giststate->tupdesc, &isnull[i]);
|
|
|
|
gistdentryinit(giststate, i, &attdata[i],
|
|
|
|
datum, r, p, o,
|
|
|
|
false, isnull[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Forms union of oldtup and addtup, if union == oldtup then return NULL
|
|
|
|
*/
|
|
|
|
IndexTuple
|
|
|
|
gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
|
|
|
|
{
|
|
|
|
bool neednew = false;
|
|
|
|
GISTENTRY oldentries[INDEX_MAX_KEYS],
|
|
|
|
addentries[INDEX_MAX_KEYS];
|
|
|
|
bool oldisnull[INDEX_MAX_KEYS],
|
|
|
|
addisnull[INDEX_MAX_KEYS];
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
Datum attr[INDEX_MAX_KEYS];
|
|
|
|
bool isnull[INDEX_MAX_KEYS];
|
|
|
|
IndexTuple newtup = NULL;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
gistDeCompressAtt(giststate, r, oldtup, NULL,
|
|
|
|
(OffsetNumber) 0, oldentries, oldisnull);
|
|
|
|
|
|
|
|
gistDeCompressAtt(giststate, r, addtup, NULL,
|
|
|
|
(OffsetNumber) 0, addentries, addisnull);
|
|
|
|
|
|
|
|
for (i = 0; i < r->rd_att->natts; i++)
|
|
|
|
{
|
|
|
|
gistMakeUnionKey(giststate, i,
|
|
|
|
oldentries + i, oldisnull[i],
|
|
|
|
addentries + i, addisnull[i],
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
attr + i, isnull + i);
|
|
|
|
|
|
|
|
if (neednew)
|
|
|
|
/* we already need new key, so we can skip check */
|
|
|
|
continue;
|
|
|
|
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
if (isnull[i])
|
|
|
|
/* union of key may be NULL if and only if both keys are NULL */
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!addisnull[i])
|
|
|
|
{
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
if (oldisnull[i] ||
|
|
|
|
!gistKeyIsEQ(giststate, i, oldentries[i].key, attr[i]))
|
|
|
|
neednew = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (neednew)
|
|
|
|
{
|
|
|
|
/* need to update key */
|
Repair bugs in GiST page splitting code for multi-column indexes.
When considering a non-last column in a multi-column GiST index,
gistsplit.c tries to improve on the split chosen by the opclass-specific
pickSplit function by considering penalties for the next column. However,
there were two bugs in this code: it failed to recompute the union keys for
the leftmost index columns, even though these might well change after
reassigning tuples; and it included the old union keys in the recomputation
for the columns it did recompute, so that those keys couldn't get smaller
even if they should. The first problem could result in an invalid index
in which searches wouldn't find index entries that are in fact present;
the second would make the index less efficient to search.
Both of these errors were caused by misuse of gistMakeUnionItVec, whose
API was designed in a way that just begged such errors to be made. There
is no situation in which it's safe or useful to compute the union keys for
a subset of the index columns, and there is no caller that wants any
previous union keys to be included in the computation; so the undocumented
choice to treat the union keys as in/out rather than pure output parameters
is a waste of code as well as being dangerous.
Hence, rather than just making a minimal patch, I've changed the API of
gistMakeUnionItVec to remove the "startkey" parameter (it now always
processes all index columns) and treat the attr/isnull arrays as purely
output parameters.
In passing, also get rid of a couple of unnecessary and dangerous uses
of static variables in gistutil.c. It's remarkable that the one in
gistMakeUnionKey hasn't given us portability troubles before now, because
in addition to posing a re-entrancy hazard, it was unsafely assuming that
a static char[] array would have at least Datum alignment.
Per investigation of a trouble report from Tomas Vondra. (There are also
some bugs in contrib/btree_gist to be fixed, but that seems like material
for a separate patch.) Back-patch to all supported branches.
13 years ago
|
|
|
newtup = gistFormTuple(giststate, r, attr, isnull, false);
|
|
|
|
newtup->t_tid = oldtup->t_tid;
|
|
|
|
}
|
|
|
|
|
|
|
|
return newtup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Search an upper index page for the entry with lowest penalty for insertion
|
|
|
|
* of the new index key contained in "it".
|
|
|
|
*
|
|
|
|
* Returns the index of the page entry to insert into.
|
|
|
|
*/
|
|
|
|
OffsetNumber
|
|
|
|
gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */
|
|
|
|
GISTSTATE *giststate)
|
|
|
|
{
|
|
|
|
OffsetNumber result;
|
|
|
|
OffsetNumber maxoff;
|
|
|
|
OffsetNumber i;
|
|
|
|
float best_penalty[INDEX_MAX_KEYS];
|
|
|
|
GISTENTRY entry,
|
|
|
|
identry[INDEX_MAX_KEYS];
|
|
|
|
bool isnull[INDEX_MAX_KEYS];
|
|
|
|
int keep_current_best;
|
|
|
|
|
|
|
|
Assert(!GistPageIsLeaf(p));
|
|
|
|
|
|
|
|
gistDeCompressAtt(giststate, r,
|
|
|
|
it, NULL, (OffsetNumber) 0,
|
|
|
|
identry, isnull);
|
|
|
|
|
|
|
|
/* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */
|
|
|
|
result = FirstOffsetNumber;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The index may have multiple columns, and there's a penalty value for
|
|
|
|
* each column. The penalty associated with a column that appears earlier
|
|
|
|
* in the index definition is strictly more important than the penalty of
|
|
|
|
* a column that appears later in the index definition.
|
|
|
|
*
|
|
|
|
* best_penalty[j] is the best penalty we have seen so far for column j,
|
|
|
|
* or -1 when we haven't yet examined column j. Array entries to the
|
|
|
|
* right of the first -1 are undefined.
|
|
|
|
*/
|
|
|
|
best_penalty[0] = -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we find a tuple that's exactly as good as the currently best one, we
|
|
|
|
* could use either one. When inserting a lot of tuples with the same or
|
|
|
|
* similar keys, it's preferable to descend down the same path when
|
|
|
|
* possible, as that's more cache-friendly. On the other hand, if all
|
|
|
|
* inserts land on the same leaf page after a split, we're never going to
|
|
|
|
* insert anything to the other half of the split, and will end up using
|
|
|
|
* only 50% of the available space. Distributing the inserts evenly would
|
|
|
|
* lead to better space usage, but that hurts cache-locality during
|
|
|
|
* insertion. To get the best of both worlds, when we find a tuple that's
|
|
|
|
* exactly as good as the previous best, choose randomly whether to stick
|
|
|
|
* to the old best, or use the new one. Once we decide to stick to the
|
|
|
|
* old best, we keep sticking to it for any subsequent equally good tuples
|
|
|
|
* we might find. This favors tuples with low offsets, but still allows
|
|
|
|
* some inserts to go to other equally-good subtrees.
|
|
|
|
*
|
|
|
|
* keep_current_best is -1 if we haven't yet had to make a random choice
|
|
|
|
* whether to keep the current best tuple. If we have done so, and
|
|
|
|
* decided to keep it, keep_current_best is 1; if we've decided to
|
|
|
|
* replace, keep_current_best is 0. (This state will be reset to -1 as
|
|
|
|
* soon as we've made the replacement, but sometimes we make the choice in
|
|
|
|
* advance of actually finding a replacement best tuple.)
|
|
|
|
*/
|
|
|
|
keep_current_best = -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop over tuples on page.
|
|
|
|
*/
|
|
|
|
maxoff = PageGetMaxOffsetNumber(p);
|
|
|
|
Assert(maxoff >= FirstOffsetNumber);
|
|
|
|
|
|
|
|
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
|
|
|
|
{
|
|
|
|
IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
|
|
|
|
bool zero_penalty;
|
|
|
|
int j;
|
|
|
|
|
|
|
|
zero_penalty = true;
|
|
|
|
|
|
|
|
/* Loop over index attributes. */
|
|
|
|
for (j = 0; j < r->rd_att->natts; j++)
|
|
|
|
{
|
|
|
|
Datum datum;
|
|
|
|
float usize;
|
|
|
|
bool IsNull;
|
|
|
|
|
|
|
|
/* Compute penalty for this column. */
|
|
|
|
datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull);
|
|
|
|
gistdentryinit(giststate, j, &entry, datum, r, p, i,
|
|
|
|
false, IsNull);
|
|
|
|
usize = gistpenalty(giststate, j, &entry, IsNull,
|
|
|
|
&identry[j], isnull[j]);
|
|
|
|
if (usize > 0)
|
|
|
|
zero_penalty = false;
|
|
|
|
|
|
|
|
if (best_penalty[j] < 0 || usize < best_penalty[j])
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* New best penalty for column. Tentatively select this tuple
|
|
|
|
* as the target, and record the best penalty. Then reset the
|
|
|
|
* next column's penalty to "unknown" (and indirectly, the
|
|
|
|
* same for all the ones to its right). This will force us to
|
|
|
|
* adopt this tuple's penalty values as the best for all the
|
|
|
|
* remaining columns during subsequent loop iterations.
|
|
|
|
*/
|
|
|
|
result = i;
|
|
|
|
best_penalty[j] = usize;
|
|
|
|
|
|
|
|
if (j < r->rd_att->natts - 1)
|
|
|
|
best_penalty[j + 1] = -1;
|
|
|
|
|
|
|
|
/* we have new best, so reset keep-it decision */
|
|
|
|
keep_current_best = -1;
|
|
|
|
}
|
|
|
|
else if (best_penalty[j] == usize)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The current tuple is exactly as good for this column as the
|
|
|
|
* best tuple seen so far. The next iteration of this loop
|
|
|
|
* will compare the next column.
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The current tuple is worse for this column than the best
|
|
|
|
* tuple seen so far. Skip the remaining columns and move on
|
|
|
|
* to the next tuple, if any.
|
|
|
|
*/
|
|
|
|
zero_penalty = false; /* so outer loop won't exit */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we looped past the last column, and did not update "result",
|
|
|
|
* then this tuple is exactly as good as the prior best tuple.
|
|
|
|
*/
|
|
|
|
if (j == r->rd_att->natts && result != i)
|
|
|
|
{
|
|
|
|
if (keep_current_best == -1)
|
|
|
|
{
|
|
|
|
/* we didn't make the random choice yet for this old best */
|
|
|
|
keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
|
|
|
|
}
|
|
|
|
if (keep_current_best == 0)
|
|
|
|
{
|
|
|
|
/* we choose to use the new tuple */
|
|
|
|
result = i;
|
|
|
|
/* choose again if there are even more exactly-as-good ones */
|
|
|
|
keep_current_best = -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we find a tuple with zero penalty for all columns, and we've
|
|
|
|
* decided we don't want to search for another tuple with equal
|
|
|
|
* penalty, there's no need to examine remaining tuples; just break
|
|
|
|
* out of the loop and return it.
|
|
|
|
*/
|
|
|
|
if (zero_penalty)
|
|
|
|
{
|
|
|
|
if (keep_current_best == -1)
|
|
|
|
{
|
|
|
|
/* we didn't make the random choice yet for this old best */
|
|
|
|
keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
|
|
|
|
}
|
|
|
|
if (keep_current_best == 1)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initialize a GiST entry with a decompressed version of key
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
|
|
|
|
Datum k, Relation r, Page pg, OffsetNumber o,
|
|
|
|
bool l, bool isNull)
|
|
|
|
{
|
|
|
|
if (!isNull)
|
|
|
|
{
|
|
|
|
GISTENTRY *dep;
|
|
|
|
|
|
|
|
gistentryinit(*e, k, r, pg, o, l);
|
|
|
|
|
|
|
|
/* there may not be a decompress function in opclass */
|
|
|
|
if (!OidIsValid(giststate->decompressFn[nkey].fn_oid))
|
|
|
|
return;
|
|
|
|
|
|
|
|
dep = (GISTENTRY *)
|
|
|
|
DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey],
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
8 years ago
|
|
|
giststate->supportCollation[nkey],
|
|
|
|
PointerGetDatum(e)));
|
|
|
|
/* decompressFn may just return the given pointer */
|
|
|
|
if (dep != e)
|
|
|
|
gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset,
|
|
|
|
dep->leafkey);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
gistentryinit(*e, (Datum) 0, r, pg, o, l);
|
|
|
|
}
|
|
|
|
|
|
|
|
IndexTuple
|
|
|
|
gistFormTuple(GISTSTATE *giststate, Relation r,
|
|
|
|
Datum attdata[], bool isnull[], bool isleaf)
|
|
|
|
{
|
|
|
|
Datum compatt[INDEX_MAX_KEYS];
|
|
|
|
int i;
|
|
|
|
IndexTuple res;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Call the compress method on each attribute.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < r->rd_att->natts; i++)
|
|
|
|
{
|
|
|
|
if (isnull[i])
|
|
|
|
compatt[i] = (Datum) 0;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
GISTENTRY centry;
|
|
|
|
GISTENTRY *cep;
|
|
|
|
|
|
|
|
gistentryinit(centry, attdata[i], r, NULL, (OffsetNumber) 0,
|
|
|
|
isleaf);
|
|
|
|
/* there may not be a compress function in opclass */
|
|
|
|
if (OidIsValid(giststate->compressFn[i].fn_oid))
|
|
|
|
cep = (GISTENTRY *)
|
|
|
|
DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[i],
|
|
|
|
giststate->supportCollation[i],
|
|
|
|
PointerGetDatum(¢ry)));
|
|
|
|
else
|
|
|
|
cep = ¢ry;
|
|
|
|
compatt[i] = cep->key;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
res = index_form_tuple(giststate->tupdesc, compatt, isnull);
|
|
|
|
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
15 years ago
|
|
|
/*
|
|
|
|
* The offset number on tuples on internal pages is unused. For historical
|
|
|
|
* reasons, it is set to 0xffff.
|
Rewrite the GiST insertion logic so that we don't need the post-recovery
cleanup stage to finish incomplete inserts or splits anymore. There was two
reasons for the cleanup step:
1. When a new tuple was inserted to a leaf page, the downlink in the parent
needed to be updated to contain (ie. to be consistent with) the new key.
Updating the parent in turn might require recursively updating the parent of
the parent. We now handle that by updating the parent while traversing down
the tree, so that when we insert the leaf tuple, all the parents are already
consistent with the new key, and the tree is consistent at every step.
2. When a page is split, we need to insert the downlink for the new right
page(s), and update the downlink for the original page to not include keys
that moved to the right page(s). We now handle that by setting a new flag,
F_FOLLOW_RIGHT, on the non-rightmost pages in the split. When that flag is
set, scans always follow the rightlink, regardless of the NSN mechanism used
to detect concurrent page splits. That way the tree is consistent right after
split, even though the downlink is still missing. This is very similar to the
way B-tree splits are handled. When the downlink is inserted in the parent,
the flag is cleared. To keep the insertion algorithm simple, when an
insertion sees an incomplete split, indicated by the F_FOLLOW_RIGHT flag, it
finishes the split before doing anything else.
These changes allow removing the whole "invalid tuple" mechanism, but I
retained the scan code to still follow invalid tuples correctly. While we
don't create any such tuples anymore, we want to handle them gracefully in
case you pg_upgrade a GiST index that has them. If we encounter any on an
insert, though, we just throw an error saying that you need to REINDEX.
The issue that got me into doing this is that if you did a checkpoint while
an insert or split was in progress, and the checkpoint finishes quickly so
that there is no WAL record related to the insert between RedoRecPtr and the
checkpoint record, recovery from that checkpoint would not know to finish
the incomplete insert. IOW, we have the same issue we solved with the
rm_safe_restartpoint mechanism during normal operation too. It's highly
unlikely to happen in practice, and this fix is far too large to backpatch,
so we're just going to live with in previous versions, but this refactoring
fixes it going forward.
With this patch, you don't get the annoying
'index "FOO" needs VACUUM or REINDEX to finish crash recovery' notices
anymore if you crash at an unfortunate moment.
15 years ago
|
|
|
*/
|
|
|
|
ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* initialize a GiST entry with fetched value in key field
|
|
|
|
*/
|
|
|
|
static Datum
|
|
|
|
gistFetchAtt(GISTSTATE *giststate, int nkey, Datum k, Relation r)
|
|
|
|
{
|
|
|
|
GISTENTRY fentry;
|
|
|
|
GISTENTRY *fep;
|
|
|
|
|
|
|
|
gistentryinit(fentry, k, r, NULL, (OffsetNumber) 0, false);
|
|
|
|
|
|
|
|
fep = (GISTENTRY *)
|
|
|
|
DatumGetPointer(FunctionCall1Coll(&giststate->fetchFn[nkey],
|
|
|
|
giststate->supportCollation[nkey],
|
|
|
|
PointerGetDatum(&fentry)));
|
|
|
|
|
|
|
|
/* fetchFn set 'key', return it to the caller */
|
|
|
|
return fep->key;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fetch all keys in tuple.
|
|
|
|
* Returns a new HeapTuple containing the originally-indexed data.
|
|
|
|
*/
|
|
|
|
HeapTuple
|
|
|
|
gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple)
|
|
|
|
{
|
|
|
|
MemoryContext oldcxt = MemoryContextSwitchTo(giststate->tempCxt);
|
|
|
|
Datum fetchatt[INDEX_MAX_KEYS];
|
|
|
|
bool isnull[INDEX_MAX_KEYS];
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < r->rd_att->natts; i++)
|
|
|
|
{
|
|
|
|
Datum datum;
|
|
|
|
|
|
|
|
datum = index_getattr(tuple, i + 1, giststate->tupdesc, &isnull[i]);
|
|
|
|
|
|
|
|
if (giststate->fetchFn[i].fn_oid != InvalidOid)
|
|
|
|
{
|
|
|
|
if (!isnull[i])
|
|
|
|
fetchatt[i] = gistFetchAtt(giststate, i, datum, r);
|
|
|
|
else
|
|
|
|
fetchatt[i] = (Datum) 0;
|
|
|
|
}
|
|
|
|
else if (giststate->compressFn[i].fn_oid == InvalidOid)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* If opclass does not provide compress method that could change
|
|
|
|
* original value, att is necessarily stored in original form.
|
|
|
|
*/
|
|
|
|
if (!isnull[i])
|
|
|
|
fetchatt[i] = datum;
|
|
|
|
else
|
|
|
|
fetchatt[i] = (Datum) 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Index-only scans not supported for this column. Since the
|
|
|
|
* planner chose an index-only scan anyway, it is not interested
|
|
|
|
* in this column, and we can replace it with a NULL.
|
|
|
|
*/
|
|
|
|
isnull[i] = true;
|
|
|
|
fetchatt[i] = (Datum) 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MemoryContextSwitchTo(oldcxt);
|
|
|
|
|
|
|
|
return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull);
|
|
|
|
}
|
|
|
|
|
|
|
|
float
|
|
|
|
gistpenalty(GISTSTATE *giststate, int attno,
|
|
|
|
GISTENTRY *orig, bool isNullOrig,
|
|
|
|
GISTENTRY *add, bool isNullAdd)
|
|
|
|
{
|
|
|
|
float penalty = 0.0;
|
|
|
|
|
|
|
|
if (giststate->penaltyFn[attno].fn_strict == false ||
|
|
|
|
(isNullOrig == false && isNullAdd == false))
|
|
|
|
{
|
|
|
|
FunctionCall3Coll(&giststate->penaltyFn[attno],
|
|
|
|
giststate->supportCollation[attno],
|
|
|
|
PointerGetDatum(orig),
|
|
|
|
PointerGetDatum(add),
|
|
|
|
PointerGetDatum(&penalty));
|
|
|
|
/* disallow negative or NaN penalty */
|
|
|
|
if (isnan(penalty) || penalty < 0.0)
|
|
|
|
penalty = 0.0;
|
|
|
|
}
|
|
|
|
else if (isNullOrig && isNullAdd)
|
|
|
|
penalty = 0.0;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* try to prevent mixing null and non-null values */
|
|
|
|
penalty = get_float4_infinity();
|
|
|
|
}
|
|
|
|
|
|
|
|
return penalty;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize a new index page
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
GISTInitBuffer(Buffer b, uint32 f)
|
|
|
|
{
|
|
|
|
GISTPageOpaque opaque;
|
|
|
|
Page page;
|
|
|
|
Size pageSize;
|
|
|
|
|
|
|
|
pageSize = BufferGetPageSize(b);
|
|
|
|
page = BufferGetPage(b);
|
|
|
|
PageInit(page, pageSize, sizeof(GISTPageOpaqueData));
|
|
|
|
|
|
|
|
opaque = GistPageGetOpaque(page);
|
|
|
|
/* page was already zeroed by PageInit, so this is not needed: */
|
|
|
|
/* memset(&(opaque->nsn), 0, sizeof(GistNSN)); */
|
|
|
|
opaque->rightlink = InvalidBlockNumber;
|
|
|
|
opaque->flags = f;
|
|
|
|
opaque->gist_page_id = GIST_PAGE_ID;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify that a freshly-read page looks sane.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
gistcheckpage(Relation rel, Buffer buf)
|
|
|
|
{
|
|
|
|
Page page = BufferGetPage(buf);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ReadBuffer verifies that every newly-read page passes
|
|
|
|
* PageHeaderIsValid, which means it either contains a reasonably sane
|
|
|
|
* page header or is all-zero. We have to defend against the all-zero
|
|
|
|
* case, however.
|
|
|
|
*/
|
|
|
|
if (PageIsNew(page))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.
By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis. However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent. That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.
This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
8 years ago
|
|
|
errmsg("index \"%s\" contains unexpected zero page at block %u",
|
|
|
|
RelationGetRelationName(rel),
|
|
|
|
BufferGetBlockNumber(buf)),
|
|
|
|
errhint("Please REINDEX it.")));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Additionally check that the special area looks sane.
|
|
|
|
*/
|
|
|
|
if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData)))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INDEX_CORRUPTED),
|
|
|
|
errmsg("index \"%s\" contains corrupted page at block %u",
|
|
|
|
RelationGetRelationName(rel),
|
|
|
|
BufferGetBlockNumber(buf)),
|
|
|
|
errhint("Please REINDEX it.")));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a new page (either by recycling, or by extending the index file)
|
|
|
|
*
|
|
|
|
* The returned buffer is already pinned and exclusive-locked
|
|
|
|
*
|
|
|
|
* Caller is responsible for initializing the page by calling GISTInitBuffer
|
|
|
|
*/
|
|
|
|
Buffer
|
|
|
|
gistNewBuffer(Relation r)
|
|
|
|
{
|
|
|
|
Buffer buffer;
|
|
|
|
bool needLock;
|
|
|
|
|
|
|
|
/* First, try to get a page from FSM */
|
|
|
|
for (;;)
|
|
|
|
{
|
|
|
|
BlockNumber blkno = GetFreeIndexPage(r);
|
|
|
|
|
|
|
|
if (blkno == InvalidBlockNumber)
|
|
|
|
break; /* nothing left in FSM */
|
|
|
|
|
|
|
|
buffer = ReadBuffer(r, blkno);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have to guard against the possibility that someone else already
|
|
|
|
* recycled this page; the buffer may be locked if so.
|
|
|
|
*/
|
|
|
|
if (ConditionalLockBuffer(buffer))
|
|
|
|
{
|
|
|
|
Page page = BufferGetPage(buffer);
|
|
|
|
|
|
|
|
if (PageIsNew(page))
|
|
|
|
return buffer; /* OK to use, if never initialized */
|
|
|
|
|
|
|
|
gistcheckpage(r, buffer);
|
|
|
|
|
|
|
|
if (GistPageIsDeleted(page))
|
|
|
|
return buffer; /* OK to use */
|
|
|
|
|
|
|
|
LockBuffer(buffer, GIST_UNLOCK);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Can't use it, so release buffer and try again */
|
|
|
|
ReleaseBuffer(buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Must extend the file */
|
|
|
|
needLock = !RELATION_IS_LOCAL(r);
|
|
|
|
|
|
|
|
if (needLock)
|
|
|
|
LockRelationForExtension(r, ExclusiveLock);
|
|
|
|
|
|
|
|
buffer = ReadBuffer(r, P_NEW);
|
|
|
|
LockBuffer(buffer, GIST_EXCLUSIVE);
|
|
|
|
|
|
|
|
if (needLock)
|
|
|
|
UnlockRelationForExtension(r, ExclusiveLock);
|
|
|
|
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
10 years ago
|
|
|
bytea *
|
|
|
|
gistoptions(Datum reloptions, bool validate)
|
|
|
|
{
|
|
|
|
relopt_value *options;
|
|
|
|
GiSTOptions *rdopts;
|
|
|
|
int numoptions;
|
|
|
|
static const relopt_parse_elt tab[] = {
|
|
|
|
{"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
|
|
|
|
{"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
|
|
|
|
};
|
|
|
|
|
|
|
|
options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
|
|
|
|
&numoptions);
|
|
|
|
|
|
|
|
/* if none set, we're done */
|
|
|
|
if (numoptions == 0)
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
10 years ago
|
|
|
return NULL;
|
|
|
|
|
|
|
|
rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);
|
|
|
|
|
|
|
|
fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
|
|
|
|
validate, tab, lengthof(tab));
|
|
|
|
|
|
|
|
pfree(options);
|
|
|
|
|
Restructure index access method API to hide most of it at the C level.
This patch reduces pg_am to just two columns, a name and a handler
function. All the data formerly obtained from pg_am is now provided
in a C struct returned by the handler function. This is similar to
the designs we've adopted for FDWs and tablesample methods. There
are multiple advantages. For one, the index AM's support functions
are now simple C functions, making them faster to call and much less
error-prone, since the C compiler can now check function signatures.
For another, this will make it far more practical to define index access
methods in installable extensions.
A disadvantage is that SQL-level code can no longer see attributes
of index AMs; in particular, some of the crosschecks in the opr_sanity
regression test are no longer possible from SQL. We've addressed that
by adding a facility for the index AM to perform such checks instead.
(Much more could be done in that line, but for now we're content if the
amvalidate functions more or less replace what opr_sanity used to do.)
We might also want to expose some sort of reporting functionality, but
this patch doesn't do that.
Alexander Korotkov, reviewed by Petr Jelínek, and rather heavily
editorialized on by me.
10 years ago
|
|
|
return (bytea *) rdopts;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* gistproperty() -- Check boolean properties of indexes.
|
|
|
|
*
|
|
|
|
* This is optional for most AMs, but is required for GiST because the core
|
|
|
|
* property code doesn't support AMPROP_DISTANCE_ORDERABLE. We also handle
|
|
|
|
* AMPROP_RETURNABLE here to save opening the rel to call gistcanreturn.
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
gistproperty(Oid index_oid, int attno,
|
|
|
|
IndexAMProperty prop, const char *propname,
|
|
|
|
bool *res, bool *isnull)
|
|
|
|
{
|
|
|
|
Oid opclass,
|
|
|
|
opfamily,
|
|
|
|
opcintype;
|
|
|
|
int16 procno;
|
|
|
|
|
|
|
|
/* Only answer column-level inquiries */
|
|
|
|
if (attno == 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Currently, GiST distance-ordered scans require that there be a distance
|
|
|
|
* function in the opclass with the default types (i.e. the one loaded
|
|
|
|
* into the relcache entry, see initGISTstate). So we assume that if such
|
|
|
|
* a function exists, then there's a reason for it (rather than grubbing
|
|
|
|
* through all the opfamily's operators to find an ordered one).
|
|
|
|
*
|
|
|
|
* Essentially the same code can test whether we support returning the
|
|
|
|
* column data, since that's true if the opclass provides a fetch proc.
|
|
|
|
*/
|
|
|
|
|
|
|
|
switch (prop)
|
|
|
|
{
|
|
|
|
case AMPROP_DISTANCE_ORDERABLE:
|
|
|
|
procno = GIST_DISTANCE_PROC;
|
|
|
|
break;
|
|
|
|
case AMPROP_RETURNABLE:
|
|
|
|
procno = GIST_FETCH_PROC;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* First we need to know the column's opclass. */
|
|
|
|
opclass = get_index_column_opclass(index_oid, attno);
|
|
|
|
if (!OidIsValid(opclass))
|
|
|
|
{
|
|
|
|
*isnull = true;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Now look up the opclass family and input datatype. */
|
|
|
|
if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype))
|
|
|
|
{
|
|
|
|
*isnull = true;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* And now we can check whether the function is provided. */
|
|
|
|
|
|
|
|
*res = SearchSysCacheExists4(AMPROCNUM,
|
|
|
|
ObjectIdGetDatum(opfamily),
|
|
|
|
ObjectIdGetDatum(opcintype),
|
|
|
|
ObjectIdGetDatum(opcintype),
|
|
|
|
Int16GetDatum(procno));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Special case: even without a fetch function, AMPROP_RETURNABLE is true
|
|
|
|
* if the opclass has no compress function.
|
|
|
|
*/
|
|
|
|
if (prop == AMPROP_RETURNABLE && !*res)
|
|
|
|
{
|
|
|
|
*res = !SearchSysCacheExists4(AMPROCNUM,
|
|
|
|
ObjectIdGetDatum(opfamily),
|
|
|
|
ObjectIdGetDatum(opcintype),
|
|
|
|
ObjectIdGetDatum(opcintype),
|
|
|
|
Int16GetDatum(GIST_COMPRESS_PROC));
|
|
|
|
}
|
|
|
|
|
|
|
|
*isnull = false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Temporary and unlogged GiST indexes are not WAL-logged, but we need LSNs
|
|
|
|
* to detect concurrent page splits anyway. This function provides a fake
|
|
|
|
* sequence of LSNs for that purpose.
|
|
|
|
*/
|
|
|
|
XLogRecPtr
|
|
|
|
gistGetFakeLSN(Relation rel)
|
|
|
|
{
|
|
|
|
static XLogRecPtr counter = 1;
|
|
|
|
|
|
|
|
if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Temporary relations are only accessible in our session, so a simple
|
|
|
|
* backend-local counter will do.
|
|
|
|
*/
|
|
|
|
return counter++;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Unlogged relations are accessible from other backends, and survive
|
|
|
|
* (clean) restarts. GetFakeLSNForUnloggedRel() handles that for us.
|
|
|
|
*/
|
|
|
|
Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED);
|
|
|
|
return GetFakeLSNForUnloggedRel();
|
|
|
|
}
|
|
|
|
}
|