mirror of https://github.com/postgres/postgres
BRIN is a new index access method intended to accelerate scans of very large tables, without the maintenance overhead of btrees or other traditional indexes. They work by maintaining "summary" data about block ranges. Bitmap index scans work by reading each summary tuple and comparing them with the query quals; all pages in the range are returned in a lossy TID bitmap if the quals are consistent with the values in the summary tuple, otherwise not. Normal index scans are not supported because these indexes do not store TIDs. As new tuples are added into the index, the summary information is updated (if the block range in which the tuple is added is already summarized) or not; in the latter case, a subsequent pass of VACUUM or the brin_summarize_new_values() function will create the summary information. For data types with natural 1-D sort orders, the summary info consists of the maximum and the minimum values of each indexed column within each page range. This type of operator class we call "Minmax", and we supply a bunch of them for most data types with B-tree opclasses. Since the BRIN code is generalized, other approaches are possible for things such as arrays, geometric types, ranges, etc; even for things such as enum types we could do something different than minmax with better results. In this commit I only include minmax. Catalog version bumped due to new builtin catalog entries. There's more that could be done here, but this is a good step forwards. Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera, with contribution by Heikki Linnakangas. Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas. Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo. PS: The research leading to these results has received funding from the European Union's Seventh Framework Programme (FP7/2007-2013) under grant agreement n° 318633.pull/14/head
parent
1961b1c131
commit
7516f52594
@ -0,0 +1,414 @@ |
||||
/*
|
||||
* brinfuncs.c |
||||
* Functions to investigate BRIN indexes |
||||
* |
||||
* Copyright (c) 2014, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/pageinspect/brinfuncs.c |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/htup_details.h" |
||||
#include "access/brin.h" |
||||
#include "access/brin_internal.h" |
||||
#include "access/brin_page.h" |
||||
#include "access/brin_revmap.h" |
||||
#include "access/brin_tuple.h" |
||||
#include "catalog/index.h" |
||||
#include "catalog/pg_type.h" |
||||
#include "funcapi.h" |
||||
#include "lib/stringinfo.h" |
||||
#include "utils/array.h" |
||||
#include "utils/builtins.h" |
||||
#include "utils/lsyscache.h" |
||||
#include "utils/rel.h" |
||||
#include "miscadmin.h" |
||||
|
||||
|
||||
PG_FUNCTION_INFO_V1(brin_page_type); |
||||
PG_FUNCTION_INFO_V1(brin_page_items); |
||||
PG_FUNCTION_INFO_V1(brin_metapage_info); |
||||
PG_FUNCTION_INFO_V1(brin_revmap_data); |
||||
|
||||
/*
 * Per-index-column output state used by brin_page_items().
 *
 * nstored is the number of entries in outputFn[]; it is copied from the
 * column's BrinOpcInfo.oi_nstored.  outputFn[i] is the looked-up type output
 * function for the i-th stored value of the column.
 */
typedef struct brin_column_state |
||||
{ |
||||
int nstored; |
||||
FmgrInfo outputFn[FLEXIBLE_ARRAY_MEMBER]; |
||||
} brin_column_state; |
||||
|
||||
/*
 * Cross-call state of the brin_page_items() set-returning function.
 *
 * bdesc       BRIN descriptor built from the index with brin_build_desc()
 * page        the page image being dumped
 * offset      offset number of the item currently being reported
 * unusedItem  true when the item at "offset" is not in use
 * done        set once "offset" has moved past the last item on the page
 * attno       1-based attribute of the current tuple being reported
 * dtup        deformed form of the current tuple; NULL means the next call
 *             must fetch and deform the item at "offset"
 * columns     per-attribute output state, indexed by attno - 1
 */
typedef struct brin_page_state |
||||
{ |
||||
BrinDesc *bdesc; |
||||
Page page; |
||||
OffsetNumber offset; |
||||
bool unusedItem; |
||||
bool done; |
||||
AttrNumber attno; |
||||
BrinMemTuple *dtup; |
||||
brin_column_state *columns[FLEXIBLE_ARRAY_MEMBER]; |
||||
} brin_page_state; |
||||
|
||||
|
||||
/* Checks that raw_page is a BRIN page of the given type; defined below. */
static Page verify_brin_page(bytea *raw_page, uint16 type, |
||||
const char *strtype); |
||||
|
||||
Datum |
||||
brin_page_type(PG_FUNCTION_ARGS) |
||||
{ |
||||
bytea *raw_page = PG_GETARG_BYTEA_P(0); |
||||
Page page = VARDATA(raw_page); |
||||
BrinSpecialSpace *special; |
||||
char *type; |
||||
|
||||
special = (BrinSpecialSpace *) PageGetSpecialPointer(page); |
||||
|
||||
switch (special->type) |
||||
{ |
||||
case BRIN_PAGETYPE_META: |
||||
type = "meta"; |
||||
break; |
||||
case BRIN_PAGETYPE_REVMAP: |
||||
type = "revmap"; |
||||
break; |
||||
case BRIN_PAGETYPE_REGULAR: |
||||
type = "regular"; |
||||
break; |
||||
default: |
||||
type = psprintf("unknown (%02x)", special->type); |
||||
break; |
||||
} |
||||
|
||||
PG_RETURN_TEXT_P(cstring_to_text(type)); |
||||
} |
||||
|
||||
/*
|
||||
* Verify that the given bytea contains a BRIN page of the indicated page |
||||
* type, or die in the attempt. A pointer to the page is returned. |
||||
*/ |
||||
static Page |
||||
verify_brin_page(bytea *raw_page, uint16 type, const char *strtype) |
||||
{ |
||||
Page page; |
||||
int raw_page_size; |
||||
BrinSpecialSpace *special; |
||||
|
||||
raw_page_size = VARSIZE(raw_page) - VARHDRSZ; |
||||
|
||||
if (raw_page_size < SizeOfPageHeaderData) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
||||
errmsg("input page too small"), |
||||
errdetail("Expected size %d, got %d", raw_page_size, BLCKSZ))); |
||||
|
||||
page = VARDATA(raw_page); |
||||
|
||||
/* verify the special space says this page is what we want */ |
||||
special = (BrinSpecialSpace *) PageGetSpecialPointer(page); |
||||
if (special->type != type) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
||||
errmsg("page is not a BRIN page of type \"%s\"", strtype), |
||||
errdetail("Expected special type %08x, got %08x.", |
||||
type, special->type))); |
||||
|
||||
return page; |
||||
} |
||||
|
||||
|
||||
/*
|
||||
* Extract all item values from a BRIN index page |
||||
* |
||||
* Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); |
||||
*/ |
||||
Datum |
||||
brin_page_items(PG_FUNCTION_ARGS) |
||||
{ |
||||
brin_page_state *state; |
||||
FuncCallContext *fctx; |
||||
|
||||
if (!superuser()) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), |
||||
(errmsg("must be superuser to use raw page functions")))); |
||||
|
||||
if (SRF_IS_FIRSTCALL()) |
||||
{ |
||||
bytea *raw_page = PG_GETARG_BYTEA_P(0); |
||||
Oid indexRelid = PG_GETARG_OID(1); |
||||
Page page; |
||||
TupleDesc tupdesc; |
||||
MemoryContext mctx; |
||||
Relation indexRel; |
||||
AttrNumber attno; |
||||
|
||||
/* minimally verify the page we got */ |
||||
page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); |
||||
|
||||
/* create a function context for cross-call persistence */ |
||||
fctx = SRF_FIRSTCALL_INIT(); |
||||
|
||||
/* switch to memory context appropriate for multiple function calls */ |
||||
mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); |
||||
|
||||
/* Build a tuple descriptor for our result type */ |
||||
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) |
||||
elog(ERROR, "return type must be a row type"); |
||||
|
||||
indexRel = index_open(indexRelid, AccessShareLock); |
||||
|
||||
state = palloc(offsetof(brin_page_state, columns) + |
||||
sizeof(brin_column_state) * RelationGetDescr(indexRel)->natts); |
||||
|
||||
state->bdesc = brin_build_desc(indexRel); |
||||
state->page = page; |
||||
state->offset = FirstOffsetNumber; |
||||
state->unusedItem = false; |
||||
state->done = false; |
||||
state->dtup = NULL; |
||||
|
||||
/*
|
||||
* Initialize output functions for all indexed datatypes; simplifies |
||||
* calling them later. |
||||
*/ |
||||
for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++) |
||||
{ |
||||
Oid output; |
||||
bool isVarlena; |
||||
BrinOpcInfo *opcinfo; |
||||
int i; |
||||
brin_column_state *column; |
||||
|
||||
opcinfo = state->bdesc->bd_info[attno - 1]; |
||||
column = palloc(offsetof(brin_column_state, outputFn) + |
||||
sizeof(FmgrInfo) * opcinfo->oi_nstored); |
||||
|
||||
column->nstored = opcinfo->oi_nstored; |
||||
for (i = 0; i < opcinfo->oi_nstored; i++) |
||||
{ |
||||
getTypeOutputInfo(opcinfo->oi_typids[i], &output, &isVarlena); |
||||
fmgr_info(output, &column->outputFn[i]); |
||||
} |
||||
|
||||
state->columns[attno - 1] = column; |
||||
} |
||||
|
||||
index_close(indexRel, AccessShareLock); |
||||
|
||||
fctx->user_fctx = state; |
||||
fctx->tuple_desc = BlessTupleDesc(tupdesc); |
||||
|
||||
MemoryContextSwitchTo(mctx); |
||||
} |
||||
|
||||
fctx = SRF_PERCALL_SETUP(); |
||||
state = fctx->user_fctx; |
||||
|
||||
if (!state->done) |
||||
{ |
||||
HeapTuple result; |
||||
Datum values[7]; |
||||
bool nulls[7]; |
||||
|
||||
/*
|
||||
* This loop is called once for every attribute of every tuple in the |
||||
* page. At the start of a tuple, we get a NULL dtup; that's our |
||||
* signal for obtaining and decoding the next one. If that's not the |
||||
* case, we output the next attribute. |
||||
*/ |
||||
if (state->dtup == NULL) |
||||
{ |
||||
BrinTuple *tup; |
||||
MemoryContext mctx; |
||||
ItemId itemId; |
||||
|
||||
/* deformed tuple must live across calls */ |
||||
mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); |
||||
|
||||
/* verify item status: if there's no data, we can't decode */ |
||||
itemId = PageGetItemId(state->page, state->offset); |
||||
if (ItemIdIsUsed(itemId)) |
||||
{ |
||||
tup = (BrinTuple *) PageGetItem(state->page, |
||||
PageGetItemId(state->page, |
||||
state->offset)); |
||||
state->dtup = brin_deform_tuple(state->bdesc, tup); |
||||
state->attno = 1; |
||||
state->unusedItem = false; |
||||
} |
||||
else |
||||
state->unusedItem = true; |
||||
|
||||
MemoryContextSwitchTo(mctx); |
||||
} |
||||
else |
||||
state->attno++; |
||||
|
||||
MemSet(nulls, 0, sizeof(nulls)); |
||||
|
||||
if (state->unusedItem) |
||||
{ |
||||
values[0] = UInt16GetDatum(state->offset); |
||||
nulls[1] = true; |
||||
nulls[2] = true; |
||||
nulls[3] = true; |
||||
nulls[4] = true; |
||||
nulls[5] = true; |
||||
nulls[6] = true; |
||||
} |
||||
else |
||||
{ |
||||
int att = state->attno - 1; |
||||
|
||||
values[0] = UInt16GetDatum(state->offset); |
||||
values[1] = UInt32GetDatum(state->dtup->bt_blkno); |
||||
values[2] = UInt16GetDatum(state->attno); |
||||
values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls); |
||||
values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls); |
||||
values[5] = BoolGetDatum(state->dtup->bt_placeholder); |
||||
if (!state->dtup->bt_columns[att].bv_allnulls) |
||||
{ |
||||
BrinValues *bvalues = &state->dtup->bt_columns[att]; |
||||
StringInfoData s; |
||||
bool first; |
||||
int i; |
||||
|
||||
initStringInfo(&s); |
||||
appendStringInfoChar(&s, '{'); |
||||
|
||||
first = true; |
||||
for (i = 0; i < state->columns[att]->nstored; i++) |
||||
{ |
||||
char *val; |
||||
|
||||
if (!first) |
||||
appendStringInfoString(&s, " .. "); |
||||
first = false; |
||||
val = OutputFunctionCall(&state->columns[att]->outputFn[i], |
||||
bvalues->bv_values[i]); |
||||
appendStringInfoString(&s, val); |
||||
pfree(val); |
||||
} |
||||
appendStringInfoChar(&s, '}'); |
||||
|
||||
values[6] = CStringGetTextDatum(s.data); |
||||
pfree(s.data); |
||||
} |
||||
else |
||||
{ |
||||
nulls[6] = true; |
||||
} |
||||
} |
||||
|
||||
result = heap_form_tuple(fctx->tuple_desc, values, nulls); |
||||
|
||||
/*
|
||||
* If the item was unused, jump straight to the next one; otherwise, |
||||
* the only cleanup needed here is to set our signal to go to the next |
||||
* tuple in the following iteration, by freeing the current one. |
||||
*/ |
||||
if (state->unusedItem) |
||||
state->offset = OffsetNumberNext(state->offset); |
||||
else if (state->attno >= state->bdesc->bd_tupdesc->natts) |
||||
{ |
||||
pfree(state->dtup); |
||||
state->dtup = NULL; |
||||
state->offset = OffsetNumberNext(state->offset); |
||||
} |
||||
|
||||
/*
|
||||
* If we're beyond the end of the page, set flag to end the function in |
||||
* the following iteration. |
||||
*/ |
||||
if (state->offset > PageGetMaxOffsetNumber(state->page)) |
||||
state->done = true; |
||||
|
||||
SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result)); |
||||
} |
||||
|
||||
brin_free_desc(state->bdesc); |
||||
|
||||
SRF_RETURN_DONE(fctx); |
||||
} |
||||
|
||||
Datum |
||||
brin_metapage_info(PG_FUNCTION_ARGS) |
||||
{ |
||||
bytea *raw_page = PG_GETARG_BYTEA_P(0); |
||||
Page page; |
||||
BrinMetaPageData *meta; |
||||
TupleDesc tupdesc; |
||||
Datum values[4]; |
||||
bool nulls[4]; |
||||
HeapTuple htup; |
||||
|
||||
page = verify_brin_page(raw_page, BRIN_PAGETYPE_META, "metapage"); |
||||
|
||||
/* Build a tuple descriptor for our result type */ |
||||
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) |
||||
elog(ERROR, "return type must be a row type"); |
||||
tupdesc = BlessTupleDesc(tupdesc); |
||||
|
||||
/* Extract values from the metapage */ |
||||
meta = (BrinMetaPageData *) PageGetContents(page); |
||||
MemSet(nulls, 0, sizeof(nulls)); |
||||
values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->brinMagic)); |
||||
values[1] = Int32GetDatum(meta->brinVersion); |
||||
values[2] = Int32GetDatum(meta->pagesPerRange); |
||||
values[3] = Int64GetDatum(meta->lastRevmapPage); |
||||
|
||||
htup = heap_form_tuple(tupdesc, values, nulls); |
||||
|
||||
PG_RETURN_DATUM(HeapTupleGetDatum(htup)); |
||||
} |
||||
|
||||
/*
|
||||
* Return the TID array stored in a BRIN revmap page |
||||
*/ |
||||
Datum |
||||
brin_revmap_data(PG_FUNCTION_ARGS) |
||||
{ |
||||
struct |
||||
{ |
||||
ItemPointerData *tids; |
||||
int idx; |
||||
} *state; |
||||
FuncCallContext *fctx; |
||||
|
||||
if (!superuser()) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), |
||||
(errmsg("must be superuser to use raw page functions")))); |
||||
|
||||
if (SRF_IS_FIRSTCALL()) |
||||
{ |
||||
bytea *raw_page = PG_GETARG_BYTEA_P(0); |
||||
MemoryContext mctx; |
||||
Page page; |
||||
|
||||
/* minimally verify the page we got */ |
||||
page = verify_brin_page(raw_page, BRIN_PAGETYPE_REVMAP, "revmap"); |
||||
|
||||
/* create a function context for cross-call persistence */ |
||||
fctx = SRF_FIRSTCALL_INIT(); |
||||
|
||||
/* switch to memory context appropriate for multiple function calls */ |
||||
mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); |
||||
|
||||
state = palloc(sizeof(*state)); |
||||
state->tids = ((RevmapContents *) PageGetContents(page))->rm_tids; |
||||
state->idx = 0; |
||||
|
||||
fctx->user_fctx = state; |
||||
|
||||
MemoryContextSwitchTo(mctx); |
||||
} |
||||
|
||||
fctx = SRF_PERCALL_SETUP(); |
||||
state = fctx->user_fctx; |
||||
|
||||
if (state->idx < REVMAP_PAGE_MAXITEMS) |
||||
SRF_RETURN_NEXT(fctx, PointerGetDatum(&state->tids[state->idx++])); |
||||
|
||||
SRF_RETURN_DONE(fctx); |
||||
} |
@ -0,0 +1,43 @@ |
||||
/* contrib/pageinspect/pageinspect--1.2--1.3.sql */ |
||||
|
||||
-- complain if script is sourced in psql, rather than via ALTER EXTENSION |
||||
\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.3'" to load this file. \quit |
||||
|
||||
-- |
||||
-- brin_page_type() |
||||
-- |
||||
-- Takes a raw page image and returns the BRIN page type as text:
-- 'meta', 'revmap', 'regular', or 'unknown (xx)' for any other type value.
CREATE FUNCTION brin_page_type(IN page bytea) |
||||
RETURNS text |
||||
AS 'MODULE_PATHNAME', 'brin_page_type' |
||||
LANGUAGE C STRICT; |
||||
|
||||
-- |
||||
-- brin_metapage_info() |
||||
-- |
||||
-- Takes a raw image of a BRIN metapage and returns a single row:
--   magic           the magic number, as text in the form 0xXXXXXXXX
--   version         the version number stored in the metapage
--   pagesperrange   the pages-per-range value stored in the metapage
--   lastrevmappage  the last revmap page recorded in the metapage
CREATE FUNCTION brin_metapage_info(IN page bytea, OUT magic text, |
||||
OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint) |
||||
AS 'MODULE_PATHNAME', 'brin_metapage_info' |
||||
LANGUAGE C STRICT; |
||||
|
||||
-- |
||||
-- brin_revmap_data() |
||||
-- Takes a raw image of a BRIN revmap page and returns the TIDs stored in
-- it, one row per array slot.
CREATE FUNCTION brin_revmap_data(IN page bytea, |
||||
OUT pages tid) |
||||
RETURNS SETOF tid |
||||
AS 'MODULE_PATHNAME', 'brin_revmap_data' |
||||
LANGUAGE C STRICT; |
||||
|
||||
-- |
||||
-- brin_page_items() |
||||
-- |
||||
-- Takes a raw image of a regular BRIN page plus the index it belongs to,
-- and returns one row per attribute of every used item on the page:
--   itemoffset   offset number of the item within the page
--   blknum       block number stored in the index tuple
--   attnum       index attribute number, starting at 1
--   allnulls     all-nulls flag of the attribute's summary
--   hasnulls     has-nulls flag of the attribute's summary
--   placeholder  placeholder flag of the index tuple
--   value        stored values as text, '{v1 .. v2}'; NULL if allnulls
-- An unused item yields one row with only itemoffset set.
CREATE FUNCTION brin_page_items(IN page bytea, IN index_oid regclass, |
||||
OUT itemoffset int, |
||||
OUT blknum int, |
||||
OUT attnum int, |
||||
OUT allnulls bool, |
||||
OUT hasnulls bool, |
||||
OUT placeholder bool, |
||||
OUT value text) |
||||
RETURNS SETOF record |
||||
AS 'MODULE_PATHNAME', 'brin_page_items' |
||||
LANGUAGE C STRICT; |
@ -1,5 +1,5 @@ |
||||
# pageinspect extension |
||||
comment = 'inspect the contents of database pages at a low level' |
||||
default_version = '1.2' |
||||
default_version = '1.3' |
||||
module_pathname = '$libdir/pageinspect' |
||||
relocatable = true |
||||
|
@ -0,0 +1,490 @@ |
||||
<!-- doc/src/sgml/brin.sgml --> |
||||
|
||||
<chapter id="BRIN"> |
||||
<title>BRIN Indexes</title> |
||||
|
||||
<indexterm> |
||||
<primary>index</primary> |
||||
<secondary>BRIN</secondary> |
||||
</indexterm> |
||||
|
||||
<sect1 id="brin-intro"> |
||||
<title>Introduction</title> |
||||
|
||||
<para> |
||||
<acronym>BRIN</acronym> stands for Block Range Index. |
||||
<acronym>BRIN</acronym> is designed for handling very large tables |
||||
in which certain columns have some natural correlation with their |
||||
physical location within the table. |
||||
A <firstterm>block range</> is a group of pages that are physically |
||||
adjacent in the table; for each block range, some summary info is stored |
||||
by the index. |
||||
For example, a table storing a store's sale orders might have |
||||
a date column on which each order was placed, and most of the time |
||||
the entries for earlier orders will appear earlier in the table as well; |
||||
a table storing a ZIP code column might have all codes for a city |
||||
grouped together naturally. |
||||
</para> |
||||
|
||||
<para> |
||||
<acronym>BRIN</acronym> indexes can satisfy queries via regular bitmap |
||||
index scans, and will return all tuples in all pages within each range if |
||||
the summary info stored by the index is <firstterm>consistent</> with the |
||||
query conditions. |
||||
The query executor is in charge of rechecking these tuples and discarding |
||||
those that do not match the query conditions — in other words, these |
||||
indexes are lossy. |
||||
Because a <acronym>BRIN</acronym> index is very small, scanning the index |
||||
adds little overhead compared to a sequential scan, but may avoid scanning |
||||
large parts of the table that are known not to contain matching tuples. |
||||
</para> |
||||
|
||||
<para> |
||||
The specific data that a <acronym>BRIN</acronym> index will store, |
||||
as well as the specific queries that the index will be able to satisfy, |
||||
depend on the operator class selected for each column of the index. |
||||
Data types having a linear sort order can have operator classes that |
||||
store the minimum and maximum value within each block range, for instance; |
||||
geometrical types might store the bounding box for all the objects |
||||
in the block range. |
||||
</para> |
||||
|
||||
<para> |
||||
The size of the block range is determined at index creation time by |
||||
the <literal>pages_per_range</> storage parameter. The number of index |
||||
entries will be equal to the size of the relation in pages divided by |
||||
the selected value for <literal>pages_per_range</>. Therefore, the smaller |
||||
the number, the larger the index becomes (because of the need to |
||||
store more index entries), but at the same time the summary data stored can |
||||
be more precise and more data blocks can be skipped during an index scan. |
||||
</para> |
||||
</sect1> |
||||
|
||||
<sect1 id="brin-builtin-opclasses"> |
||||
<title>Built-in Operator Classes</title> |
||||
|
||||
<para> |
||||
The core <productname>PostgreSQL</productname> distribution includes |
||||
the <acronym>BRIN</acronym> operator classes shown in |
||||
<xref linkend="brin-builtin-opclasses-table">. |
||||
</para> |
||||
|
||||
<para> |
||||
The <firstterm>minmax</> |
||||
operator classes store the minimum and the maximum values appearing |
||||
in the indexed column within the range. |
||||
</para> |
||||
|
||||
<table id="brin-builtin-opclasses-table"> |
||||
<title>Built-in <acronym>BRIN</acronym> Operator Classes</title> |
||||
<tgroup cols="3"> |
||||
<thead> |
||||
<row> |
||||
<entry>Name</entry> |
||||
<entry>Indexed Data Type</entry> |
||||
<entry>Indexable Operators</entry> |
||||
</row> |
||||
</thead> |
||||
<tbody> |
||||
<row> |
||||
<entry><literal>bytea_minmax_ops</literal></entry> |
||||
<entry><type>bytea</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>char_minmax_ops</literal></entry> |
||||
<entry><type>"char"</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>name_minmax_ops</literal></entry> |
||||
<entry><type>name</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>int8_minmax_ops</literal></entry> |
||||
<entry><type>bigint</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>int2_minmax_ops</literal></entry> |
||||
<entry><type>smallint</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>int4_minmax_ops</literal></entry> |
||||
<entry><type>integer</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>text_minmax_ops</literal></entry> |
||||
<entry><type>text</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>oid_minmax_ops</literal></entry> |
||||
<entry><type>oid</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>tid_minmax_ops</literal></entry> |
||||
<entry><type>tid</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>float4_minmax_ops</literal></entry> |
||||
<entry><type>real</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>float8_minmax_ops</literal></entry> |
||||
<entry><type>double precision</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>abstime_minmax_ops</literal></entry> |
||||
<entry><type>abstime</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>reltime_minmax_ops</literal></entry> |
||||
<entry><type>reltime</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>macaddr_minmax_ops</literal></entry> |
||||
<entry><type>macaddr</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>inet_minmax_ops</literal></entry> |
||||
<entry><type>inet</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>bpchar_minmax_ops</literal></entry> |
||||
<entry><type>character</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>date_minmax_ops</literal></entry> |
||||
<entry><type>date</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>time_minmax_ops</literal></entry> |
||||
<entry><type>time without time zone</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>timestamp_minmax_ops</literal></entry> |
||||
<entry><type>timestamp without time zone</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>timestamptz_minmax_ops</literal></entry> |
||||
<entry><type>timestamp with time zone</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>interval_minmax_ops</literal></entry> |
||||
<entry><type>interval</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>timetz_minmax_ops</literal></entry> |
||||
<entry><type>time with time zone</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>bit_minmax_ops</literal></entry> |
||||
<entry><type>bit</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>varbit_minmax_ops</literal></entry> |
||||
<entry><type>bit varying</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>numeric_minmax_ops</literal></entry> |
||||
<entry><type>numeric</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>uuid_minmax_ops</literal></entry> |
||||
<entry><type>uuid</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
<row> |
||||
<entry><literal>pg_lsn_minmax_ops</literal></entry> |
||||
<entry><type>pg_lsn</type></entry> |
||||
<entry> |
||||
<literal><</literal> |
||||
<literal><=</literal> |
||||
<literal>=</literal> |
||||
<literal>>=</literal> |
||||
<literal>></literal> |
||||
</entry> |
||||
</row> |
||||
</tbody> |
||||
</tgroup> |
||||
</table> |
||||
</sect1> |
||||
|
||||
<sect1 id="brin-extensibility"> |
||||
<title>Extensibility</title> |
||||
|
||||
<para> |
||||
The <acronym>BRIN</acronym> interface has a high level of abstraction, |
||||
requiring the access method implementer only to implement the semantics |
||||
of the data type being accessed. The <acronym>BRIN</acronym> layer |
||||
itself takes care of concurrency, logging and searching the index structure. |
||||
</para> |
||||
|
||||
<para> |
||||
All it takes to get a <acronym>BRIN</acronym> access method working is to |
||||
implement a few user-defined methods, which define the behavior of |
||||
summary values stored in the index and the way they interact with |
||||
scan keys. |
||||
In short, <acronym>BRIN</acronym> combines |
||||
extensibility with generality, code reuse, and a clean interface. |
||||
</para> |
||||
|
||||
<para> |
||||
There are four methods that an operator class for <acronym>BRIN</acronym> |
||||
must provide: |
||||
|
||||
<variablelist> |
||||
<varlistentry> |
||||
<term><function>BrinOpcInfo *opcInfo(Oid type_oid)</></term> |
||||
<listitem> |
||||
<para> |
||||
Returns internal information about the indexed columns' summary data. |
||||
The return value must point to a palloc'd <structname>BrinOpcInfo</>, |
||||
which has this definition: |
||||
<programlisting> |
||||
typedef struct BrinOpcInfo |
||||
{ |
||||
/* Number of columns stored in an index column of this opclass */ |
||||
uint16 oi_nstored; |
||||
|
||||
/* Opaque pointer for the opclass' private use */ |
||||
void *oi_opaque; |
||||
|
||||
/* Type IDs of the stored columns */ |
||||
Oid oi_typids[FLEXIBLE_ARRAY_MEMBER]; |
||||
} BrinOpcInfo; |
||||
</programlisting> |
||||
<structname>BrinOpcInfo</>.<structfield>oi_opaque</> can be used by the |
||||
operator class routines to pass information between support procedures |
||||
during an index scan. |
||||
</para> |
||||
</listitem> |
||||
</varlistentry> |
||||
|
||||
<varlistentry> |
||||
<term><function>bool consistent(BrinDesc *bdesc, BrinValues *column, |
||||
ScanKey key)</function></term> |
||||
<listitem> |
||||
<para> |
||||
Returns whether the ScanKey is consistent with the given indexed |
||||
values for a range. |
||||
The attribute number to use is passed as part of the scan key. |
||||
</para> |
||||
</listitem> |
||||
</varlistentry> |
||||
|
||||
<varlistentry> |
||||
<term><function>bool addValue(BrinDesc *bdesc, BrinValues *column, |
||||
Datum newval, bool isnull)</function></term> |
||||
<listitem> |
||||
<para> |
||||
Given an index tuple and an indexed value, modifies the indicated |
||||
attribute of the tuple so that it additionally represents the new value. |
||||
If any modification was done to the tuple, <literal>true</literal> is |
||||
returned. |
||||
</para> |
||||
</listitem> |
||||
</varlistentry> |
||||
|
||||
<varlistentry> |
||||
<term><function>bool unionTuples(BrinDesc *bdesc, BrinValues *a, |
||||
BrinValues *b)</function></term> |
||||
<listitem> |
||||
<para> |
||||
Consolidates two index tuples. Given two index tuples, modifies the |
||||
indicated attribute of the first of them so that it represents both tuples. |
||||
The second tuple is not modified. |
||||
</para> |
||||
</listitem> |
||||
</varlistentry> |
||||
</variablelist> |
||||
|
||||
To implement these methods in a generic way, the operator class |
||||
defines its own internal support functions. |
||||
(For instance, <quote>min/max</> operator classes implement |
||||
support functions for the four inequality operators for the data type.) |
||||
Additionally, the operator class must supply appropriate |
||||
operator entries, |
||||
to enable the optimizer to use the index when those operators are |
||||
used in queries. |
||||
</para> |
||||
</sect1> |
||||
</chapter> |
@ -0,0 +1,18 @@ |
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
# Makefile--
|
||||
# Makefile for access/brin
|
||||
#
|
||||
# IDENTIFICATION
|
||||
# src/backend/access/brin/Makefile
|
||||
#
|
||||
#-------------------------------------------------------------------------
|
||||
|
||||
subdir = src/backend/access/brin
|
||||
top_builddir = ../../../..
|
||||
include $(top_builddir)/src/Makefile.global |
||||
|
||||
OBJS = brin.o brin_pageops.o brin_revmap.o brin_tuple.o brin_xlog.o \
|
||||
brin_minmax.o
|
||||
|
||||
include $(top_srcdir)/src/backend/common.mk |
@ -0,0 +1,189 @@ |
||||
Block Range Indexes (BRIN) |
||||
========================== |
||||
|
||||
BRIN indexes intend to enable very fast scanning of extremely large tables. |
||||
|
||||
The essential idea of a BRIN index is to keep track of summarizing values in |
||||
consecutive groups of heap pages (page ranges); for example, the minimum and |
||||
maximum values for datatypes with a btree opclass, or the bounding box for |
||||
geometric types. These values can be used to avoid scanning such pages |
||||
during a table scan, depending on query quals. |
||||
|
||||
The cost of this is having to update the stored summary values of each page |
||||
range as tuples are inserted into them. |
||||
|
||||
|
||||
Access Method Design |
||||
-------------------- |
||||
|
||||
Since item pointers are not stored inside indexes of this type, it is not |
||||
possible to support the amgettuple interface. Instead, we only provide |
||||
amgetbitmap support. The amgetbitmap routine returns a lossy TIDBitmap |
||||
comprising all pages in those page ranges that match the query |
||||
qualifications. The recheck step in the BitmapHeapScan node prunes tuples |
||||
that do not satisfy the query qualifications. |
||||
|
||||
An operator class must have the following entries: |
||||
|
||||
- generic support procedures (pg_amproc), identical to all opclasses: |
||||
* "opcinfo" (BRIN_PROCNUM_OPCINFO) initializes a structure for index |
||||
creation or scanning |
||||
* "addValue" (BRIN_PROCNUM_ADDVALUE) takes an index tuple and a heap item, |
||||
and possibly changes the index tuple so that it includes the heap item |
||||
values |
||||
* "consistent" (BRIN_PROCNUM_CONSISTENT) takes an index tuple and query |
||||
quals, and returns whether the index tuple values match the query quals. |
||||
* "union" (BRIN_PROCNUM_UNION) takes two index tuples and modifies the first |
||||
one so that it represents the union of the two. |
||||
Procedure numbers up to 10 are reserved for future expansion. |
||||
|
||||
Additionally, each opclass needs additional support functions: |
||||
- Minmax-style operator classes: |
||||
* Proc numbers 11-14 are used for the functions implementing inequality |
||||
operators for the type, in this order: less than, less or equal, |
||||
greater or equal, greater than. |
||||
|
||||
Opclasses using a different design will require different additional procedure |
||||
numbers. |
||||
|
||||
Operator classes also need to have operator (pg_amop) entries so that the |
||||
optimizer can choose the index to execute queries. |
||||
- Minmax-style operator classes: |
||||
* The same operators as btree (<=, <, =, >=, >) |
||||
|
||||
Each index tuple stores some NULL bits and some opclass-specified values. The |
||||
NULL bits are stored in a single null bitmask of length twice the number of columns. The |
||||
generic NULL bits indicate, for each column: |
||||
* bt_hasnulls: Whether there's any NULL value at all in the page range |
||||
* bt_allnulls: Whether all values are NULLs in the page range |
||||
|
||||
The opclass-specified values are: |
||||
- Minmax-style operator classes |
||||
* minimum value across all tuples in the range |
||||
* maximum value across all tuples in the range |
||||
|
||||
Note that the addValue and Union support procedures must be careful to |
||||
datumCopy() the values they want to store in the in-memory BRIN tuple, and |
||||
must pfree() the old copies when replacing older ones. Since some values |
||||
referenced from the tuple persist and others go away, there is no |
||||
well-defined lifetime for a memory context that would make this automatic. |
||||
|
||||
|
||||
The Range Map |
||||
------------- |
||||
|
||||
To find the index tuple for a particular page range, we have an internal |
||||
structure we call the range map, or "revmap" for short. This stores one TID |
||||
per page range, which is the address of the index tuple summarizing that |
||||
range. Since the map entries are fixed size, it is possible to compute the |
||||
address of the range map entry for any given heap page by simple arithmetic. |
||||
|
||||
When a new heap tuple is inserted in a summarized page range, we compare the |
||||
existing index tuple with the new heap tuple. If the heap tuple is outside |
||||
the summarization data given by the index tuple for any indexed column (or |
||||
if the new heap tuple contains null values but the index tuple indicates |
||||
there are no nulls), the index is updated with the new values. In many |
||||
cases it is possible to update the index tuple in-place, but if the new |
||||
index tuple is larger than the old one and there's not enough space in the |
||||
page, it is necessary to create a new index tuple with the new values. The |
||||
range map can be updated quickly to point to it; the old index tuple is |
||||
removed. |
||||
|
||||
If the range map points to an invalid TID, the corresponding page range is |
||||
considered to be not summarized. When tuples are added to unsummarized |
||||
pages, nothing needs to happen. |
||||
|
||||
To scan a table following a BRIN index, we scan the range map sequentially. |
||||
This yields index tuples in ascending page range order. Query quals are |
||||
matched to each index tuple; if they match, each page within the page range |
||||
is returned as part of the output TID bitmap. If there's no match, they are |
||||
skipped. Range map entries returning invalid index TIDs, that is |
||||
unsummarized page ranges, are also returned in the TID bitmap. |
||||
|
||||
The revmap is stored in the first few blocks of the index main fork, |
||||
immediately following the metapage. Whenever the revmap needs to be |
||||
extended by another page, existing tuples in that page are moved to some |
||||
other page. |
||||
|
||||
Heap tuples can be removed from anywhere without restriction. It might be |
||||
useful to mark the corresponding index tuple somehow, if the heap tuple is |
||||
one of the constraining values of the summary data (i.e. either min or max |
||||
in the case of a btree-opclass-bearing datatype), so that in the future we |
||||
are aware of the need to re-execute summarization on that range, leading to |
||||
a possible tightening of the summary values. |
||||
|
||||
Summarization |
||||
------------- |
||||
|
||||
At index creation time, the whole table is scanned; for each page range the |
||||
summarizing values of each indexed column and nulls bitmap are collected and |
||||
stored in the index. The partially-filled page range at the end of the |
||||
table is also summarized. |
||||
|
||||
As new tuples get inserted at the end of the table, they may update the |
||||
index tuple that summarizes the partial page range at the end. Eventually |
||||
that page range is complete and new tuples belong in a new page range that |
||||
hasn't yet been summarized. Those insertions do not create a new index |
||||
entry; instead, the page range remains unsummarized until later. |
||||
|
||||
When VACUUM is run on the table, all unsummarized page ranges are |
||||
summarized. This action can also be invoked by the user via |
||||
brin_summarize_new_values(). Both these procedures scan all the |
||||
unsummarized ranges, and create a summary tuple. Again, this includes the |
||||
partially-filled page range at the end of the table. |
||||
|
||||
Vacuuming |
||||
--------- |
||||
|
||||
Since no heap TIDs are stored in a BRIN index, it's not necessary to scan the |
||||
index when heap tuples are removed. It might be that some summary values can |
||||
be tightened if heap tuples have been deleted; but this would represent an |
||||
optimization opportunity only, not a correctness issue. It's simpler to |
||||
represent this as the need to re-run summarization on the affected page range |
||||
rather than "subtracting" values from the existing one. This is not |
||||
currently implemented. |
||||
|
||||
Note that if there are no indexes on the table other than the BRIN index, |
||||
usage of maintenance_work_mem by vacuum can be decreased significantly, because |
||||
no detailed index scan needs to take place (and thus it's not necessary for |
||||
vacuum to save TIDs to remove). It's unlikely that a BRIN index would be the |
||||
only index on a table, though, because primary keys can be btrees only, and so |
||||
we don't implement this optimization. |
||||
|
||||
|
||||
Optimizer |
||||
--------- |
||||
|
||||
The optimizer selects the index based on the operator class' pg_amop |
||||
entries for the column. |
||||
|
||||
|
||||
Future improvements |
||||
------------------- |
||||
|
||||
* Different-size page ranges? |
||||
In the current design, each "index entry" in a BRIN index covers the same |
||||
number of pages. There's no hard reason for this; it might make sense to |
||||
allow the index to self-tune so that some index entries cover smaller page |
||||
ranges, if this allows the summary values to be more compact. This would incur |
||||
larger BRIN overhead for the index itself, but might allow better pruning of |
||||
page ranges during scan. In the limit of one index tuple per page, the index |
||||
itself would occupy too much space, even though we would be able to skip |
||||
reading the most heap pages, because the summary values are tight; in the |
||||
opposite limit of a single tuple that summarizes the whole table, we wouldn't |
||||
be able to prune anything even though the index is very small. This can |
||||
probably be made to work by using the range map as an index in itself. |
||||
|
||||
* More compact representation for TIDBitmap? |
||||
TIDBitmap is the structure used to represent bitmap scans. The |
||||
representation of lossy page ranges is not optimal for our purposes, because |
||||
it uses a Bitmapset to represent pages in the range; since we're going to return |
||||
all pages in a large range, it might be more convenient to allow for a |
||||
struct that uses start and end page numbers to represent the range, instead. |
||||
|
||||
* Better vacuuming? |
||||
It might be useful to enable passing more useful info to BRIN indexes during |
||||
vacuuming about tuples that are deleted, i.e. do not require the callback to |
||||
pass each tuple's TID. For instance we might need a callback that passes a |
||||
block number instead of a TID. That would help determine when to re-run |
||||
summarization on blocks that have seen lots of tuple deletions. |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,341 @@ |
||||
/*
|
||||
* brin_minmax.c |
||||
* Implementation of Min/Max opclass for BRIN |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/access/brin/brin_minmax.c |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/genam.h" |
||||
#include "access/brin_internal.h" |
||||
#include "access/brin_tuple.h" |
||||
#include "access/skey.h" |
||||
#include "catalog/pg_type.h" |
||||
#include "utils/datum.h" |
||||
#include "utils/lsyscache.h" |
||||
#include "utils/syscache.h" |
||||
|
||||
|
||||
/*
|
||||
* Procedure numbers must not collide with BRIN_PROCNUM defines in |
||||
* brin_internal.h. Note we only need inequality functions. |
||||
*/ |
||||
#define MINMAX_NUM_PROCNUMS 4 /* # support procs we need */ |
||||
#define PROCNUM_LESS 11 |
||||
#define PROCNUM_LESSEQUAL 12 |
||||
#define PROCNUM_GREATEREQUAL 13 |
||||
#define PROCNUM_GREATER 14 |
||||
|
||||
/*
|
||||
* Subtract this from procnum to obtain index in MinmaxOpaque arrays |
||||
* (Must be equal to minimum of private procnums) |
||||
*/ |
||||
#define PROCNUM_BASE 11 |
||||
|
||||
static FmgrInfo *minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, |
||||
uint16 procnum); |
||||
|
||||
PG_FUNCTION_INFO_V1(minmaxOpcInfo); |
||||
PG_FUNCTION_INFO_V1(minmaxAddValue); |
||||
PG_FUNCTION_INFO_V1(minmaxConsistent); |
||||
PG_FUNCTION_INFO_V1(minmaxUnion); |
||||
|
||||
|
||||
/*
 * Per-column cache of the comparison procedures used by the minmax opclass.
 * Both arrays are indexed by (procnum - PROCNUM_BASE).
 */
typedef struct MinmaxOpaque |
||||
{ |
||||
/* cached lookup data for each private support procedure */
FmgrInfo operators[MINMAX_NUM_PROCNUMS]; |
||||
/* inited[i] is true once operators[i] has been filled in */
bool inited[MINMAX_NUM_PROCNUMS]; |
||||
} MinmaxOpaque; |
||||
|
||||
Datum |
||||
minmaxOpcInfo(PG_FUNCTION_ARGS) |
||||
{ |
||||
Oid typoid = PG_GETARG_OID(0); |
||||
BrinOpcInfo *result; |
||||
|
||||
/*
|
||||
* opaque->operators is initialized lazily, as indicated by 'inited' which |
||||
* is initialized to all false by palloc0. |
||||
*/ |
||||
|
||||
result = palloc0(MAXALIGN(SizeofBrinOpcInfo(2)) + |
||||
sizeof(MinmaxOpaque)); |
||||
result->oi_nstored = 2; |
||||
result->oi_opaque = (MinmaxOpaque *) |
||||
MAXALIGN((char *) result + SizeofBrinOpcInfo(2)); |
||||
result->oi_typids[0] = typoid; |
||||
result->oi_typids[1] = typoid; |
||||
|
||||
PG_RETURN_POINTER(result); |
||||
} |
||||
|
||||
/*
|
||||
* Examine the given index tuple (which contains partial status of a certain |
||||
* page range) by comparing it to the given value that comes from another heap |
||||
* tuple. If the new value is outside the min/max range specified by the |
||||
* existing tuple values, update the index tuple and return true. Otherwise, |
||||
* return false and do not modify in this case. |
||||
*/ |
||||
Datum |
||||
minmaxAddValue(PG_FUNCTION_ARGS) |
||||
{ |
||||
BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); |
||||
BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); |
||||
Datum newval = PG_GETARG_DATUM(2); |
||||
bool isnull = PG_GETARG_DATUM(3); |
||||
Oid colloid = PG_GET_COLLATION(); |
||||
FmgrInfo *cmpFn; |
||||
Datum compar; |
||||
bool updated = false; |
||||
Form_pg_attribute attr; |
||||
AttrNumber attno; |
||||
|
||||
/*
|
||||
* If the new value is null, we record that we saw it if it's the first |
||||
* one; otherwise, there's nothing to do. |
||||
*/ |
||||
if (isnull) |
||||
{ |
||||
if (column->bv_hasnulls) |
||||
PG_RETURN_BOOL(false); |
||||
|
||||
column->bv_hasnulls = true; |
||||
PG_RETURN_BOOL(true); |
||||
} |
||||
|
||||
attno = column->bv_attno; |
||||
attr = bdesc->bd_tupdesc->attrs[attno - 1]; |
||||
|
||||
/*
|
||||
* If the recorded value is null, store the new value (which we know to be |
||||
* not null) as both minimum and maximum, and we're done. |
||||
*/ |
||||
if (column->bv_allnulls) |
||||
{ |
||||
column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen); |
||||
column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen); |
||||
column->bv_allnulls = false; |
||||
PG_RETURN_BOOL(true); |
||||
} |
||||
|
||||
/*
|
||||
* Otherwise, need to compare the new value with the existing boundaries |
||||
* and update them accordingly. First check if it's less than the |
||||
* existing minimum. |
||||
*/ |
||||
cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_LESS); |
||||
compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[0]); |
||||
if (DatumGetBool(compar)) |
||||
{ |
||||
if (!attr->attbyval) |
||||
pfree(DatumGetPointer(column->bv_values[0])); |
||||
column->bv_values[0] = datumCopy(newval, attr->attbyval, attr->attlen); |
||||
updated = true; |
||||
} |
||||
|
||||
/*
|
||||
* And now compare it to the existing maximum. |
||||
*/ |
||||
cmpFn = minmax_get_procinfo(bdesc, attno, PROCNUM_GREATER); |
||||
compar = FunctionCall2Coll(cmpFn, colloid, newval, column->bv_values[1]); |
||||
if (DatumGetBool(compar)) |
||||
{ |
||||
if (!attr->attbyval) |
||||
pfree(DatumGetPointer(column->bv_values[1])); |
||||
column->bv_values[1] = datumCopy(newval, attr->attbyval, attr->attlen); |
||||
updated = true; |
||||
} |
||||
|
||||
PG_RETURN_BOOL(updated); |
||||
} |
||||
|
||||
/*
|
||||
* Given an index tuple corresponding to a certain page range and a scan key, |
||||
* return whether the scan key is consistent with the index tuple's min/max |
||||
* values. Return true if so, false otherwise. |
||||
*/ |
||||
Datum |
||||
minmaxConsistent(PG_FUNCTION_ARGS) |
||||
{ |
||||
BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); |
||||
BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1); |
||||
ScanKey key = (ScanKey) PG_GETARG_POINTER(2); |
||||
Oid colloid = PG_GET_COLLATION(); |
||||
AttrNumber attno; |
||||
Datum value; |
||||
Datum matches; |
||||
|
||||
Assert(key->sk_attno == column->bv_attno); |
||||
|
||||
/* handle IS NULL/IS NOT NULL tests */ |
||||
if (key->sk_flags & SK_ISNULL) |
||||
{ |
||||
if (key->sk_flags & SK_SEARCHNULL) |
||||
{ |
||||
if (column->bv_allnulls || column->bv_hasnulls) |
||||
PG_RETURN_BOOL(true); |
||||
PG_RETURN_BOOL(false); |
||||
} |
||||
|
||||
/*
|
||||
* For IS NOT NULL, we can only skip ranges that are known to have |
||||
* only nulls. |
||||
*/ |
||||
Assert(key->sk_flags & SK_SEARCHNOTNULL); |
||||
PG_RETURN_BOOL(!column->bv_allnulls); |
||||
} |
||||
|
||||
/* if the range is all empty, it cannot possibly be consistent */ |
||||
if (column->bv_allnulls) |
||||
PG_RETURN_BOOL(false); |
||||
|
||||
attno = key->sk_attno; |
||||
value = key->sk_argument; |
||||
switch (key->sk_strategy) |
||||
{ |
||||
case BTLessStrategyNumber: |
||||
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_LESS), |
||||
colloid, column->bv_values[0], value); |
||||
break; |
||||
case BTLessEqualStrategyNumber: |
||||
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_LESSEQUAL), |
||||
colloid, column->bv_values[0], value); |
||||
break; |
||||
case BTEqualStrategyNumber: |
||||
|
||||
/*
|
||||
* In the equality case (WHERE col = someval), we want to return |
||||
* the current page range if the minimum value in the range <= |
||||
* scan key, and the maximum value >= scan key. |
||||
*/ |
||||
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_LESSEQUAL), |
||||
colloid, column->bv_values[0], value); |
||||
if (!DatumGetBool(matches)) |
||||
break; |
||||
/* max() >= scankey */ |
||||
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_GREATEREQUAL), |
||||
colloid, column->bv_values[1], value); |
||||
break; |
||||
case BTGreaterEqualStrategyNumber: |
||||
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_GREATEREQUAL), |
||||
colloid, column->bv_values[1], value); |
||||
break; |
||||
case BTGreaterStrategyNumber: |
||||
matches = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_GREATER), |
||||
colloid, column->bv_values[1], value); |
||||
break; |
||||
default: |
||||
/* shouldn't happen */ |
||||
elog(ERROR, "invalid strategy number %d", key->sk_strategy); |
||||
matches = 0; |
||||
break; |
||||
} |
||||
|
||||
PG_RETURN_DATUM(matches); |
||||
} |
||||
|
||||
/*
|
||||
* Given two BrinValues, update the first of them as a union of the summary |
||||
* values contained in both. The second one is untouched. |
||||
*/ |
||||
Datum |
||||
minmaxUnion(PG_FUNCTION_ARGS) |
||||
{ |
||||
BrinDesc *bdesc = (BrinDesc *) PG_GETARG_POINTER(0); |
||||
BrinValues *col_a = (BrinValues *) PG_GETARG_POINTER(1); |
||||
BrinValues *col_b = (BrinValues *) PG_GETARG_POINTER(2); |
||||
Oid colloid = PG_GET_COLLATION(); |
||||
AttrNumber attno; |
||||
Form_pg_attribute attr; |
||||
bool needsadj; |
||||
|
||||
Assert(col_a->bv_attno == col_b->bv_attno); |
||||
|
||||
/* If there are no values in B, there's nothing to do */ |
||||
if (col_b->bv_allnulls) |
||||
PG_RETURN_VOID(); |
||||
|
||||
attno = col_a->bv_attno; |
||||
attr = bdesc->bd_tupdesc->attrs[attno - 1]; |
||||
|
||||
/* Adjust "hasnulls" */ |
||||
if (col_b->bv_hasnulls && !col_a->bv_hasnulls) |
||||
col_a->bv_hasnulls = true; |
||||
|
||||
/*
|
||||
* Adjust "allnulls". If B has values but A doesn't, just copy the values |
||||
* from B into A, and we're done. (We cannot run the operators in this |
||||
* case, because values in A might contain garbage.) |
||||
*/ |
||||
if (!col_b->bv_allnulls && col_a->bv_allnulls) |
||||
{ |
||||
col_a->bv_allnulls = false; |
||||
col_a->bv_values[0] = datumCopy(col_b->bv_values[0], |
||||
attr->attbyval, attr->attlen); |
||||
col_a->bv_values[1] = datumCopy(col_b->bv_values[1], |
||||
attr->attbyval, attr->attlen); |
||||
PG_RETURN_VOID(); |
||||
} |
||||
|
||||
/* Adjust minimum, if B's min is less than A's min */ |
||||
needsadj = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_LESS), |
||||
colloid, col_b->bv_values[0], col_a->bv_values[0]); |
||||
if (needsadj) |
||||
{ |
||||
if (!attr->attbyval) |
||||
pfree(DatumGetPointer(col_a->bv_values[0])); |
||||
col_a->bv_values[0] = datumCopy(col_b->bv_values[0], |
||||
attr->attbyval, attr->attlen); |
||||
} |
||||
|
||||
/* Adjust maximum, if B's max is greater than A's max */ |
||||
needsadj = FunctionCall2Coll(minmax_get_procinfo(bdesc, attno, |
||||
PROCNUM_GREATER), |
||||
colloid, col_b->bv_values[1], col_a->bv_values[1]); |
||||
if (needsadj) |
||||
{ |
||||
if (!attr->attbyval) |
||||
pfree(DatumGetPointer(col_a->bv_values[1])); |
||||
col_a->bv_values[1] = datumCopy(col_b->bv_values[1], |
||||
attr->attbyval, attr->attlen); |
||||
} |
||||
|
||||
PG_RETURN_VOID(); |
||||
} |
||||
|
||||
/*
|
||||
* Return the procedure corresponding to the given function support number. |
||||
*/ |
||||
static FmgrInfo * |
||||
minmax_get_procinfo(BrinDesc *bdesc, uint16 attno, uint16 procnum) |
||||
{ |
||||
MinmaxOpaque *opaque; |
||||
uint16 basenum = procnum - PROCNUM_BASE; |
||||
|
||||
opaque = (MinmaxOpaque *) bdesc->bd_info[attno - 1]->oi_opaque; |
||||
|
||||
/*
|
||||
* We cache these in the opaque struct, to avoid repetitive syscache |
||||
* lookups. |
||||
*/ |
||||
if (!opaque->inited[basenum]) |
||||
{ |
||||
fmgr_info_copy(&opaque->operators[basenum], |
||||
index_getprocinfo(bdesc->bd_index, attno, procnum), |
||||
bdesc->bd_context); |
||||
opaque->inited[basenum] = true; |
||||
} |
||||
|
||||
return &opaque->operators[basenum]; |
||||
} |
@ -0,0 +1,723 @@ |
||||
/*
|
||||
* brin_pageops.c |
||||
* Page-handling routines for BRIN indexes |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/access/brin/brin_pageops.c |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/brin_pageops.h" |
||||
#include "access/brin_page.h" |
||||
#include "access/brin_revmap.h" |
||||
#include "access/brin_xlog.h" |
||||
#include "access/xloginsert.h" |
||||
#include "miscadmin.h" |
||||
#include "storage/bufmgr.h" |
||||
#include "storage/freespace.h" |
||||
#include "storage/lmgr.h" |
||||
#include "storage/smgr.h" |
||||
#include "utils/rel.h" |
||||
|
||||
|
||||
static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, |
||||
bool *was_extended); |
||||
static Size br_page_get_freespace(Page page); |
||||
|
||||
|
||||
/*
|
||||
* Update tuple origtup (size origsz), located in offset oldoff of buffer |
||||
* oldbuf, to newtup (size newsz) as summary tuple for the page range starting |
||||
* at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. |
||||
* |
||||
* If samepage is true, attempt to put the new tuple in the same page, but if |
||||
* there's no room, use some other one. |
||||
* |
||||
* If the update is successful, return true; the revmap is updated to point to |
||||
* the new tuple. If the update is not done for whatever reason, return false. |
||||
* Caller may retry the update if this happens. |
||||
*/ |
||||
bool |
||||
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, |
||||
BrinRevmap *revmap, BlockNumber heapBlk, |
||||
Buffer oldbuf, OffsetNumber oldoff, |
||||
const BrinTuple *origtup, Size origsz, |
||||
const BrinTuple *newtup, Size newsz, |
||||
bool samepage) |
||||
{ |
||||
Page oldpage; |
||||
ItemId oldlp; |
||||
BrinTuple *oldtup; |
||||
Size oldsz; |
||||
Buffer newbuf; |
||||
BrinSpecialSpace *special; |
||||
bool extended = false; |
||||
|
||||
newsz = MAXALIGN(newsz); |
||||
|
||||
/* make sure the revmap is long enough to contain the entry we need */ |
||||
brinRevmapExtend(revmap, heapBlk); |
||||
|
||||
if (!samepage) |
||||
{ |
||||
/* need a page on which to put the item */ |
||||
newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); |
||||
/* XXX delay vacuuming FSM until locks are released? */ |
||||
if (extended) |
||||
FreeSpaceMapVacuum(idxrel); |
||||
if (!BufferIsValid(newbuf)) |
||||
return false; |
||||
|
||||
/*
|
||||
* Note: it's possible (though unlikely) that the returned newbuf is |
||||
* the same as oldbuf, if brin_getinsertbuffer determined that the old |
||||
* buffer does in fact have enough space. |
||||
*/ |
||||
if (newbuf == oldbuf) |
||||
newbuf = InvalidBuffer; |
||||
} |
||||
else |
||||
{ |
||||
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); |
||||
newbuf = InvalidBuffer; |
||||
} |
||||
oldpage = BufferGetPage(oldbuf); |
||||
oldlp = PageGetItemId(oldpage, oldoff); |
||||
|
||||
/*
|
||||
* Check that the old tuple wasn't updated concurrently: it might have |
||||
* moved someplace else entirely ... |
||||
*/ |
||||
if (!ItemIdIsNormal(oldlp)) |
||||
{ |
||||
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); |
||||
if (BufferIsValid(newbuf)) |
||||
UnlockReleaseBuffer(newbuf); |
||||
return false; |
||||
} |
||||
|
||||
oldsz = ItemIdGetLength(oldlp); |
||||
oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); |
||||
|
||||
/*
|
||||
* ... or it might have been updated in place to different contents. |
||||
*/ |
||||
if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) |
||||
{ |
||||
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); |
||||
if (BufferIsValid(newbuf)) |
||||
UnlockReleaseBuffer(newbuf); |
||||
return false; |
||||
} |
||||
|
||||
special = (BrinSpecialSpace *) PageGetSpecialPointer(oldpage); |
||||
|
||||
/*
|
||||
* Great, the old tuple is intact. We can proceed with the update. |
||||
* |
||||
* If there's enough room in the old page for the new tuple, replace it. |
||||
* |
||||
* Note that there might now be enough space on the page even though the |
||||
* caller told us there isn't, if a concurrent update moved another tuple |
||||
* elsewhere or replaced a tuple with a smaller one. |
||||
*/ |
||||
if (((special->flags & BRIN_EVACUATE_PAGE) == 0) && |
||||
brin_can_do_samepage_update(oldbuf, origsz, newsz)) |
||||
{ |
||||
if (BufferIsValid(newbuf)) |
||||
UnlockReleaseBuffer(newbuf); |
||||
|
||||
START_CRIT_SECTION(); |
||||
PageIndexDeleteNoCompact(oldpage, &oldoff, 1); |
||||
if (PageAddItem(oldpage, (Item) newtup, newsz, oldoff, true, |
||||
false) == InvalidOffsetNumber) |
||||
elog(ERROR, "failed to add BRIN tuple"); |
||||
MarkBufferDirty(oldbuf); |
||||
|
||||
/* XLOG stuff */ |
||||
if (RelationNeedsWAL(idxrel)) |
||||
{ |
||||
BlockNumber blk = BufferGetBlockNumber(oldbuf); |
||||
xl_brin_samepage_update xlrec; |
||||
XLogRecPtr recptr; |
||||
XLogRecData rdata[2]; |
||||
uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; |
||||
|
||||
xlrec.node = idxrel->rd_node; |
||||
ItemPointerSetBlockNumber(&xlrec.tid, blk); |
||||
ItemPointerSetOffsetNumber(&xlrec.tid, oldoff); |
||||
rdata[0].data = (char *) &xlrec; |
||||
rdata[0].len = SizeOfBrinSamepageUpdate; |
||||
rdata[0].buffer = InvalidBuffer; |
||||
rdata[0].next = &(rdata[1]); |
||||
|
||||
rdata[1].data = (char *) newtup; |
||||
rdata[1].len = newsz; |
||||
rdata[1].buffer = oldbuf; |
||||
rdata[1].buffer_std = true; |
||||
rdata[1].next = NULL; |
||||
|
||||
recptr = XLogInsert(RM_BRIN_ID, info, rdata); |
||||
|
||||
PageSetLSN(oldpage, recptr); |
||||
} |
||||
|
||||
END_CRIT_SECTION(); |
||||
|
||||
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); |
||||
return true; |
||||
} |
||||
else if (newbuf == InvalidBuffer) |
||||
{ |
||||
/*
|
||||
* Not enough space, but caller said that there was. Tell them to |
||||
* start over. |
||||
*/ |
||||
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); |
||||
return false; |
||||
} |
||||
else |
||||
{ |
||||
/*
|
||||
* Not enough free space on the oldpage. Put the new tuple on the new |
||||
* page, and update the revmap. |
||||
*/ |
||||
Page newpage = BufferGetPage(newbuf); |
||||
Buffer revmapbuf; |
||||
ItemPointerData newtid; |
||||
OffsetNumber newoff; |
||||
|
||||
revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); |
||||
|
||||
START_CRIT_SECTION(); |
||||
|
||||
PageIndexDeleteNoCompact(oldpage, &oldoff, 1); |
||||
newoff = PageAddItem(newpage, (Item) newtup, newsz, |
||||
InvalidOffsetNumber, false, false); |
||||
if (newoff == InvalidOffsetNumber) |
||||
elog(ERROR, "failed to add BRIN tuple to new page"); |
||||
MarkBufferDirty(oldbuf); |
||||
MarkBufferDirty(newbuf); |
||||
|
||||
ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); |
||||
brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); |
||||
MarkBufferDirty(revmapbuf); |
||||
|
||||
/* XLOG stuff */ |
||||
if (RelationNeedsWAL(idxrel)) |
||||
{ |
||||
xl_brin_update xlrec; |
||||
XLogRecPtr recptr; |
||||
XLogRecData rdata[4]; |
||||
uint8 info; |
||||
|
||||
info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); |
||||
|
||||
xlrec.new.node = idxrel->rd_node; |
||||
ItemPointerSet(&xlrec.new.tid, BufferGetBlockNumber(newbuf), newoff); |
||||
xlrec.new.heapBlk = heapBlk; |
||||
xlrec.new.tuplen = newsz; |
||||
xlrec.new.revmapBlk = BufferGetBlockNumber(revmapbuf); |
||||
xlrec.new.pagesPerRange = pagesPerRange; |
||||
ItemPointerSet(&xlrec.oldtid, BufferGetBlockNumber(oldbuf), oldoff); |
||||
|
||||
rdata[0].data = (char *) &xlrec; |
||||
rdata[0].len = SizeOfBrinUpdate; |
||||
rdata[0].buffer = InvalidBuffer; |
||||
rdata[0].next = &(rdata[1]); |
||||
|
||||
rdata[1].data = (char *) newtup; |
||||
rdata[1].len = newsz; |
||||
rdata[1].buffer = extended ? InvalidBuffer : newbuf; |
||||
rdata[1].buffer_std = true; |
||||
rdata[1].next = &(rdata[2]); |
||||
|
||||
rdata[2].data = (char *) NULL; |
||||
rdata[2].len = 0; |
||||
rdata[2].buffer = revmapbuf; |
||||
rdata[2].buffer_std = true; |
||||
rdata[2].next = &(rdata[3]); |
||||
|
||||
rdata[3].data = (char *) NULL; |
||||
rdata[3].len = 0; |
||||
rdata[3].buffer = oldbuf; |
||||
rdata[3].buffer_std = true; |
||||
rdata[3].next = NULL; |
||||
|
||||
recptr = XLogInsert(RM_BRIN_ID, info, rdata); |
||||
|
||||
PageSetLSN(oldpage, recptr); |
||||
PageSetLSN(newpage, recptr); |
||||
PageSetLSN(BufferGetPage(revmapbuf), recptr); |
||||
} |
||||
|
||||
END_CRIT_SECTION(); |
||||
|
||||
LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); |
||||
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); |
||||
UnlockReleaseBuffer(newbuf); |
||||
return true; |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* Return whether brin_doupdate can do a samepage update. |
||||
*/ |
||||
bool |
||||
brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz) |
||||
{ |
||||
return |
||||
((newsz <= origsz) || |
||||
PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz)); |
||||
} |
||||
|
||||
/*
|
||||
* Insert an index tuple into the index relation. The revmap is updated to |
||||
* mark the range containing the given page as pointing to the inserted entry. |
||||
* A WAL record is written. |
||||
* |
||||
* The buffer, if valid, is first checked for free space to insert the new |
||||
* entry; if there isn't enough, a new buffer is obtained and pinned. No |
||||
* buffer lock must be held on entry, no buffer lock is held on exit. |
||||
* |
||||
* Return value is the offset number where the tuple was inserted. |
||||
*/ |
||||
OffsetNumber |
||||
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, |
||||
BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, |
||||
BrinTuple *tup, Size itemsz) |
||||
{ |
||||
Page page; |
||||
BlockNumber blk; |
||||
OffsetNumber off; |
||||
Buffer revmapbuf; |
||||
ItemPointerData tid; |
||||
bool extended = false; |
||||
|
||||
itemsz = MAXALIGN(itemsz); |
||||
|
||||
/* Make sure the revmap is long enough to contain the entry we need */ |
||||
brinRevmapExtend(revmap, heapBlk); |
||||
|
||||
/*
|
||||
* Obtain a locked buffer to insert the new tuple. Note |
||||
* brin_getinsertbuffer ensures there's enough space in the returned |
||||
* buffer. |
||||
*/ |
||||
if (BufferIsValid(*buffer)) |
||||
{ |
||||
/*
|
||||
* It's possible that another backend (or ourselves!) extended the |
||||
* revmap over the page we held a pin on, so we cannot assume that |
||||
* it's still a regular page. |
||||
*/ |
||||
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
||||
if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz) |
||||
{ |
||||
UnlockReleaseBuffer(*buffer); |
||||
*buffer = InvalidBuffer; |
||||
} |
||||
} |
||||
|
||||
if (!BufferIsValid(*buffer)) |
||||
{ |
||||
*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended); |
||||
Assert(BufferIsValid(*buffer)); |
||||
Assert(br_page_get_freespace(BufferGetPage(*buffer)) >= itemsz); |
||||
} |
||||
|
||||
/* Now obtain lock on revmap buffer */ |
||||
revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); |
||||
|
||||
page = BufferGetPage(*buffer); |
||||
blk = BufferGetBlockNumber(*buffer); |
||||
|
||||
START_CRIT_SECTION(); |
||||
off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, |
||||
false, false); |
||||
if (off == InvalidOffsetNumber) |
||||
elog(ERROR, "could not insert new index tuple to page"); |
||||
MarkBufferDirty(*buffer); |
||||
|
||||
BRIN_elog(DEBUG2, "inserted tuple (%u,%u) for range starting at %u", |
||||
blk, off, heapBlk); |
||||
|
||||
ItemPointerSet(&tid, blk, off); |
||||
brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid); |
||||
MarkBufferDirty(revmapbuf); |
||||
|
||||
/* XLOG stuff */ |
||||
if (RelationNeedsWAL(idxrel)) |
||||
{ |
||||
xl_brin_insert xlrec; |
||||
XLogRecPtr recptr; |
||||
XLogRecData rdata[3]; |
||||
uint8 info; |
||||
|
||||
info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0); |
||||
xlrec.node = idxrel->rd_node; |
||||
xlrec.heapBlk = heapBlk; |
||||
xlrec.pagesPerRange = pagesPerRange; |
||||
xlrec.revmapBlk = BufferGetBlockNumber(revmapbuf); |
||||
xlrec.tuplen = itemsz; |
||||
ItemPointerSet(&xlrec.tid, blk, off); |
||||
|
||||
rdata[0].data = (char *) &xlrec; |
||||
rdata[0].len = SizeOfBrinInsert; |
||||
rdata[0].buffer = InvalidBuffer; |
||||
rdata[0].buffer_std = false; |
||||
rdata[0].next = &(rdata[1]); |
||||
|
||||
rdata[1].data = (char *) tup; |
||||
rdata[1].len = itemsz; |
||||
rdata[1].buffer = extended ? InvalidBuffer : *buffer; |
||||
rdata[1].buffer_std = true; |
||||
rdata[1].next = &(rdata[2]); |
||||
|
||||
rdata[2].data = (char *) NULL; |
||||
rdata[2].len = 0; |
||||
rdata[2].buffer = revmapbuf; |
||||
rdata[2].buffer_std = false; |
||||
rdata[2].next = NULL; |
||||
|
||||
recptr = XLogInsert(RM_BRIN_ID, info, rdata); |
||||
|
||||
PageSetLSN(page, recptr); |
||||
PageSetLSN(BufferGetPage(revmapbuf), recptr); |
||||
} |
||||
|
||||
END_CRIT_SECTION(); |
||||
|
||||
/* Tuple is firmly on buffer; we can release our locks */ |
||||
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); |
||||
LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); |
||||
|
||||
if (extended) |
||||
FreeSpaceMapVacuum(idxrel); |
||||
|
||||
return off; |
||||
} |
||||
|
||||
/*
|
||||
* Initialize a page with the given type. |
||||
* |
||||
* Caller is responsible for marking it dirty, as appropriate. |
||||
*/ |
||||
void |
||||
brin_page_init(Page page, uint16 type) |
||||
{ |
||||
BrinSpecialSpace *special; |
||||
|
||||
PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace)); |
||||
|
||||
special = (BrinSpecialSpace *) PageGetSpecialPointer(page); |
||||
special->type = type; |
||||
} |
||||
|
||||
/*
|
||||
* Initialize a new BRIN index' metapage. |
||||
*/ |
||||
void |
||||
brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version) |
||||
{ |
||||
BrinMetaPageData *metadata; |
||||
|
||||
brin_page_init(page, BRIN_PAGETYPE_META); |
||||
|
||||
metadata = (BrinMetaPageData *) PageGetContents(page); |
||||
|
||||
metadata->brinMagic = BRIN_META_MAGIC; |
||||
metadata->brinVersion = version; |
||||
metadata->pagesPerRange = pagesPerRange; |
||||
|
||||
/*
|
||||
* Note we cheat here a little. 0 is not a valid revmap block number |
||||
* (because it's the metapage buffer), but doing this enables the first |
||||
* revmap page to be created when the index is. |
||||
*/ |
||||
metadata->lastRevmapPage = 0; |
||||
} |
||||
|
||||
/*
|
||||
* Initiate page evacuation protocol. |
||||
* |
||||
* The page must be locked in exclusive mode by the caller. |
||||
* |
||||
* If the page is not yet initialized or empty, return false without doing |
||||
* anything; it can be used for revmap without any further changes. If it |
||||
* contains tuples, mark it for evacuation and return true. |
||||
*/ |
||||
bool |
||||
brin_start_evacuating_page(Relation idxRel, Buffer buf) |
||||
{ |
||||
OffsetNumber off; |
||||
OffsetNumber maxoff; |
||||
BrinSpecialSpace *special; |
||||
Page page; |
||||
|
||||
page = BufferGetPage(buf); |
||||
|
||||
if (PageIsNew(page)) |
||||
return false; |
||||
|
||||
special = (BrinSpecialSpace *) PageGetSpecialPointer(page); |
||||
|
||||
maxoff = PageGetMaxOffsetNumber(page); |
||||
for (off = FirstOffsetNumber; off <= maxoff; off++) |
||||
{ |
||||
ItemId lp; |
||||
|
||||
lp = PageGetItemId(page, off); |
||||
if (ItemIdIsUsed(lp)) |
||||
{ |
||||
/* prevent other backends from adding more stuff to this page */ |
||||
special->flags |= BRIN_EVACUATE_PAGE; |
||||
MarkBufferDirtyHint(buf, true); |
||||
|
||||
return true; |
||||
} |
||||
} |
||||
return false; |
||||
} |
||||
|
||||
/*
|
||||
* Move all tuples out of a page. |
||||
* |
||||
* The caller must hold lock on the page. The lock and pin are released. |
||||
*/ |
||||
void |
||||
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, |
||||
BrinRevmap *revmap, Buffer buf) |
||||
{ |
||||
OffsetNumber off; |
||||
OffsetNumber maxoff; |
||||
Page page; |
||||
|
||||
page = BufferGetPage(buf); |
||||
|
||||
Assert(((BrinSpecialSpace *) |
||||
PageGetSpecialPointer(page))->flags & BRIN_EVACUATE_PAGE); |
||||
|
||||
maxoff = PageGetMaxOffsetNumber(page); |
||||
for (off = FirstOffsetNumber; off <= maxoff; off++) |
||||
{ |
||||
BrinTuple *tup; |
||||
Size sz; |
||||
ItemId lp; |
||||
|
||||
CHECK_FOR_INTERRUPTS(); |
||||
|
||||
lp = PageGetItemId(page, off); |
||||
if (ItemIdIsUsed(lp)) |
||||
{ |
||||
sz = ItemIdGetLength(lp); |
||||
tup = (BrinTuple *) PageGetItem(page, lp); |
||||
tup = brin_copy_tuple(tup, sz); |
||||
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
||||
|
||||
if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno, |
||||
buf, off, tup, sz, tup, sz, false)) |
||||
off--; /* retry */ |
||||
|
||||
LockBuffer(buf, BUFFER_LOCK_SHARE); |
||||
|
||||
/* It's possible that someone extended the revmap over this page */ |
||||
if (!BRIN_IS_REGULAR_PAGE(page)) |
||||
break; |
||||
} |
||||
} |
||||
|
||||
UnlockReleaseBuffer(buf); |
||||
} |
||||
|
||||
/*
|
||||
* Return a pinned and exclusively locked buffer which can be used to insert an |
||||
* index item of size itemsz. If oldbuf is a valid buffer, it is also locked |
||||
* (in a order determined to avoid deadlocks.) |
||||
* |
||||
* If there's no existing page with enough free space to accomodate the new |
||||
* item, the relation is extended. If this happens, *extended is set to true. |
||||
* |
||||
* If we find that the old page is no longer a regular index page (because |
||||
* of a revmap extension), the old buffer is unlocked and we return |
||||
* InvalidBuffer. |
||||
*/ |
||||
static Buffer |
||||
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, |
||||
bool *was_extended) |
||||
{ |
||||
BlockNumber oldblk; |
||||
BlockNumber newblk; |
||||
Page page; |
||||
int freespace; |
||||
|
||||
if (BufferIsValid(oldbuf)) |
||||
oldblk = BufferGetBlockNumber(oldbuf); |
||||
else |
||||
oldblk = InvalidBlockNumber; |
||||
|
||||
/*
|
||||
* Loop until we find a page with sufficient free space. By the time we |
||||
* return to caller out of this loop, both buffers are valid and locked; |
||||
* if we have to restart here, neither buffer is locked and buf is not a |
||||
* pinned buffer. |
||||
*/ |
||||
newblk = RelationGetTargetBlock(irel); |
||||
if (newblk == InvalidBlockNumber) |
||||
newblk = GetPageWithFreeSpace(irel, itemsz); |
||||
for (;;) |
||||
{ |
||||
Buffer buf; |
||||
bool extensionLockHeld = false; |
||||
bool extended = false; |
||||
|
||||
CHECK_FOR_INTERRUPTS(); |
||||
|
||||
if (newblk == InvalidBlockNumber) |
||||
{ |
||||
/*
|
||||
* There's not enough free space in any existing index page, |
||||
* according to the FSM: extend the relation to obtain a shiny new |
||||
* page. |
||||
*/ |
||||
if (!RELATION_IS_LOCAL(irel)) |
||||
{ |
||||
LockRelationForExtension(irel, ExclusiveLock); |
||||
extensionLockHeld = true; |
||||
} |
||||
buf = ReadBuffer(irel, P_NEW); |
||||
newblk = BufferGetBlockNumber(buf); |
||||
*was_extended = extended = true; |
||||
|
||||
BRIN_elog(DEBUG2, "brin_getinsertbuffer: extending to page %u", |
||||
BufferGetBlockNumber(buf)); |
||||
} |
||||
else if (newblk == oldblk) |
||||
{ |
||||
/*
|
||||
* There's an odd corner-case here where the FSM is out-of-date, |
||||
* and gave us the old page. |
||||
*/ |
||||
buf = oldbuf; |
||||
} |
||||
else |
||||
{ |
||||
buf = ReadBuffer(irel, newblk); |
||||
} |
||||
|
||||
/*
|
||||
* We lock the old buffer first, if it's earlier than the new one; but |
||||
* before we do, we need to check that it hasn't been turned into a |
||||
* revmap page concurrently; if we detect that it happened, give up |
||||
* and tell caller to start over. |
||||
*/ |
||||
if (BufferIsValid(oldbuf) && oldblk < newblk) |
||||
{ |
||||
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); |
||||
if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))) |
||||
{ |
||||
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); |
||||
ReleaseBuffer(buf); |
||||
return InvalidBuffer; |
||||
} |
||||
} |
||||
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
||||
|
||||
if (extensionLockHeld) |
||||
UnlockRelationForExtension(irel, ExclusiveLock); |
||||
|
||||
page = BufferGetPage(buf); |
||||
|
||||
if (extended) |
||||
brin_page_init(page, BRIN_PAGETYPE_REGULAR); |
||||
|
||||
/*
|
||||
* We have a new buffer to insert into. Check that the new page has |
||||
* enough free space, and return it if it does; otherwise start over. |
||||
* Note that we allow for the FSM to be out of date here, and in that |
||||
* case we update it and move on. |
||||
* |
||||
* (br_page_get_freespace also checks that the FSM didn't hand us a |
||||
* page that has since been repurposed for the revmap.) |
||||
*/ |
||||
freespace = br_page_get_freespace(page); |
||||
if (freespace >= itemsz) |
||||
{ |
||||
RelationSetTargetBlock(irel, BufferGetBlockNumber(buf)); |
||||
|
||||
/*
|
||||
* Since the target block specification can get lost on cache |
||||
* invalidations, make sure we update the more permanent FSM with |
||||
* data about it before going away. |
||||
*/ |
||||
if (extended) |
||||
RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), |
||||
freespace); |
||||
|
||||
/*
|
||||
* Lock the old buffer if not locked already. Note that in this |
||||
* case we know for sure it's a regular page: it's later than the |
||||
* new page we just got, which is not a revmap page, and revmap |
||||
* pages are always consecutive. |
||||
*/ |
||||
if (BufferIsValid(oldbuf) && oldblk > newblk) |
||||
{ |
||||
LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); |
||||
Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf))); |
||||
} |
||||
|
||||
return buf; |
||||
} |
||||
|
||||
/* This page is no good. */ |
||||
|
||||
/*
|
||||
* If an entirely new page does not contain enough free space for the |
||||
* new item, then surely that item is oversized. Complain loudly; but |
||||
* first make sure we record the page as free, for next time. |
||||
*/ |
||||
if (extended) |
||||
{ |
||||
RecordPageWithFreeSpace(irel, BufferGetBlockNumber(buf), |
||||
freespace); |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
||||
errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", |
||||
(unsigned long) itemsz, |
||||
(unsigned long) freespace, |
||||
RelationGetRelationName(irel)))); |
||||
return InvalidBuffer; /* keep compiler quiet */ |
||||
} |
||||
|
||||
if (newblk != oldblk) |
||||
UnlockReleaseBuffer(buf); |
||||
if (BufferIsValid(oldbuf) && oldblk <= newblk) |
||||
LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); |
||||
|
||||
newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz); |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* Return the amount of free space on a regular BRIN index page. |
||||
* |
||||
* If the page is not a regular page, or has been marked with the |
||||
* BRIN_EVACUATE_PAGE flag, returns 0. |
||||
*/ |
||||
static Size |
||||
br_page_get_freespace(Page page) |
||||
{ |
||||
BrinSpecialSpace *special; |
||||
|
||||
special = (BrinSpecialSpace *) PageGetSpecialPointer(page); |
||||
if (!BRIN_IS_REGULAR_PAGE(page) || |
||||
(special->flags & BRIN_EVACUATE_PAGE) != 0) |
||||
return 0; |
||||
else |
||||
return PageGetFreeSpace(page); |
||||
} |
@ -0,0 +1,510 @@ |
||||
/*
|
||||
* brin_revmap.c |
||||
* Range map for BRIN indexes |
||||
* |
||||
* The range map (revmap) is a translation structure for BRIN indexes: for each |
||||
* page range there is one summary tuple, and its location is tracked by the |
||||
* revmap. Whenever a new tuple is inserted into a table that violates the |
||||
* previously recorded summary values, a new tuple is inserted into the index |
||||
* and the revmap is updated to point to it. |
||||
* |
||||
* The revmap is stored in the first pages of the index, immediately following |
||||
* the metapage. When the revmap needs to be expanded, all tuples on the |
||||
* regular BRIN page at that block (if any) are moved out of the way. |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/access/brin/brin_revmap.c |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/brin_page.h" |
||||
#include "access/brin_pageops.h" |
||||
#include "access/brin_revmap.h" |
||||
#include "access/brin_tuple.h" |
||||
#include "access/brin_xlog.h" |
||||
#include "access/rmgr.h" |
||||
#include "access/xloginsert.h" |
||||
#include "miscadmin.h" |
||||
#include "storage/bufmgr.h" |
||||
#include "storage/lmgr.h" |
||||
#include "utils/rel.h" |
||||
|
||||
|
||||
/*
 * In revmap pages, each item stores an ItemPointerData.  These defines let one
 * find the logical revmap page number and index number of the revmap item for
 * the given heap block number.
 *
 * Both arguments are parenthesized in the expansion so that callers may pass
 * arbitrary expressions, not just plain identifiers.
 */
#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
	(((heapBlk) / (pagesPerRange)) / REVMAP_PAGE_MAXITEMS)
#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
	(((heapBlk) / (pagesPerRange)) % REVMAP_PAGE_MAXITEMS)
||||
|
||||
|
||||
/*
 * Access object for the range map of one BRIN index.  Created by
 * brinRevmapInitialize and destroyed by brinRevmapTerminate.
 */
struct BrinRevmap |
||||
{ |
||||
/* the index relation whose revmap this object gives access to */
Relation rm_irel; |
||||
/* pages-per-range setting, copied from the metapage at initialization */
BlockNumber rm_pagesPerRange; |
||||
BlockNumber rm_lastRevmapPage; /* cached from the metapage */ |
||||
/* metapage buffer; pinned at initialization, released at termination */
Buffer rm_metaBuf; |
||||
/* revmap page buffer currently pinned, or InvalidBuffer if none */
Buffer rm_currBuf; |
||||
}; |
||||
|
||||
/* typedef appears in brin_revmap.h */ |
||||
|
||||
|
||||
static BlockNumber revmap_get_blkno(BrinRevmap *revmap, |
||||
BlockNumber heapBlk); |
||||
static Buffer revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk); |
||||
static BlockNumber revmap_extend_and_get_blkno(BrinRevmap *revmap, |
||||
BlockNumber heapBlk); |
||||
static void revmap_physical_extend(BrinRevmap *revmap); |
||||
|
||||
/*
|
||||
* Initialize an access object for a range map. This must be freed by |
||||
* brinRevmapTerminate when caller is done with it. |
||||
*/ |
||||
BrinRevmap * |
||||
brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange) |
||||
{ |
||||
BrinRevmap *revmap; |
||||
Buffer meta; |
||||
BrinMetaPageData *metadata; |
||||
|
||||
meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO); |
||||
LockBuffer(meta, BUFFER_LOCK_SHARE); |
||||
metadata = (BrinMetaPageData *) PageGetContents(BufferGetPage(meta)); |
||||
|
||||
revmap = palloc(sizeof(BrinRevmap)); |
||||
revmap->rm_irel = idxrel; |
||||
revmap->rm_pagesPerRange = metadata->pagesPerRange; |
||||
revmap->rm_lastRevmapPage = metadata->lastRevmapPage; |
||||
revmap->rm_metaBuf = meta; |
||||
revmap->rm_currBuf = InvalidBuffer; |
||||
|
||||
*pagesPerRange = metadata->pagesPerRange; |
||||
|
||||
LockBuffer(meta, BUFFER_LOCK_UNLOCK); |
||||
|
||||
return revmap; |
||||
} |
||||
|
||||
/*
|
||||
* Release resources associated with a revmap access object. |
||||
*/ |
||||
void |
||||
brinRevmapTerminate(BrinRevmap *revmap) |
||||
{ |
||||
ReleaseBuffer(revmap->rm_metaBuf); |
||||
if (revmap->rm_currBuf != InvalidBuffer) |
||||
ReleaseBuffer(revmap->rm_currBuf); |
||||
pfree(revmap); |
||||
} |
||||
|
||||
/*
|
||||
* Extend the revmap to cover the given heap block number. |
||||
*/ |
||||
void |
||||
brinRevmapExtend(BrinRevmap *revmap, BlockNumber heapBlk) |
||||
{ |
||||
BlockNumber mapBlk; |
||||
|
||||
mapBlk = revmap_extend_and_get_blkno(revmap, heapBlk); |
||||
|
||||
/* Ensure the buffer we got is in the expected range */ |
||||
Assert(mapBlk != InvalidBlockNumber && |
||||
mapBlk != BRIN_METAPAGE_BLKNO && |
||||
mapBlk <= revmap->rm_lastRevmapPage); |
||||
} |
||||
|
||||
/*
|
||||
* Prepare to insert an entry into the revmap; the revmap buffer in which the |
||||
* entry is to reside is locked and returned. Most callers should call |
||||
* brinRevmapExtend beforehand, as this routine does not extend the revmap if |
||||
* it's not long enough. |
||||
* |
||||
* The returned buffer is also recorded in the revmap struct; finishing that |
||||
* releases the buffer, therefore the caller needn't do it explicitely. |
||||
*/ |
||||
Buffer |
||||
brinLockRevmapPageForUpdate(BrinRevmap *revmap, BlockNumber heapBlk) |
||||
{ |
||||
Buffer rmBuf; |
||||
|
||||
rmBuf = revmap_get_buffer(revmap, heapBlk); |
||||
LockBuffer(rmBuf, BUFFER_LOCK_EXCLUSIVE); |
||||
|
||||
return rmBuf; |
||||
} |
||||
|
||||
/*
|
||||
* In the given revmap buffer (locked appropriately by caller), which is used |
||||
* in a BRIN index of pagesPerRange pages per range, set the element |
||||
* corresponding to heap block number heapBlk to the given TID. |
||||
* |
||||
* Once the operation is complete, the caller must update the LSN on the |
||||
* returned buffer. |
||||
* |
||||
* This is used both in regular operation and during WAL replay. |
||||
*/ |
||||
void |
||||
brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange, |
||||
BlockNumber heapBlk, ItemPointerData tid) |
||||
{ |
||||
RevmapContents *contents; |
||||
ItemPointerData *iptr; |
||||
Page page; |
||||
|
||||
/* The correct page should already be pinned and locked */ |
||||
page = BufferGetPage(buf); |
||||
contents = (RevmapContents *) PageGetContents(page); |
||||
iptr = (ItemPointerData *) contents->rm_tids; |
||||
iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk); |
||||
|
||||
ItemPointerSet(iptr, |
||||
ItemPointerGetBlockNumber(&tid), |
||||
ItemPointerGetOffsetNumber(&tid)); |
||||
} |
||||
|
||||
/*
|
||||
* Fetch the BrinTuple for a given heap block. |
||||
* |
||||
* The buffer containing the tuple is locked, and returned in *buf. As an |
||||
* optimization, the caller can pass a pinned buffer *buf on entry, which will |
||||
* avoid a pin-unpin cycle when the next tuple is on the same page as a |
||||
* previous one. |
||||
* |
||||
* If no tuple is found for the given heap range, returns NULL. In that case, |
||||
* *buf might still be updated, but it's not locked. |
||||
* |
||||
* The output tuple offset within the buffer is returned in *off, and its size |
||||
* is returned in *size. |
||||
*/ |
||||
BrinTuple * |
||||
brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, |
||||
Buffer *buf, OffsetNumber *off, Size *size, int mode) |
||||
{ |
||||
Relation idxRel = revmap->rm_irel; |
||||
BlockNumber mapBlk; |
||||
RevmapContents *contents; |
||||
ItemPointerData *iptr; |
||||
BlockNumber blk; |
||||
Page page; |
||||
ItemId lp; |
||||
BrinTuple *tup; |
||||
ItemPointerData previptr; |
||||
|
||||
/* normalize the heap block number to be the first page in the range */ |
||||
heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange; |
||||
|
||||
/* Compute the revmap page number we need */ |
||||
mapBlk = revmap_get_blkno(revmap, heapBlk); |
||||
if (mapBlk == InvalidBlockNumber) |
||||
{ |
||||
*off = InvalidOffsetNumber; |
||||
return NULL; |
||||
} |
||||
|
||||
ItemPointerSetInvalid(&previptr); |
||||
for (;;) |
||||
{ |
||||
CHECK_FOR_INTERRUPTS(); |
||||
|
||||
if (revmap->rm_currBuf == InvalidBuffer || |
||||
BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk) |
||||
{ |
||||
if (revmap->rm_currBuf != InvalidBuffer) |
||||
ReleaseBuffer(revmap->rm_currBuf); |
||||
|
||||
Assert(mapBlk != InvalidBlockNumber); |
||||
revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk); |
||||
} |
||||
|
||||
LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE); |
||||
|
||||
contents = (RevmapContents *) |
||||
PageGetContents(BufferGetPage(revmap->rm_currBuf)); |
||||
iptr = contents->rm_tids; |
||||
iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk); |
||||
|
||||
if (!ItemPointerIsValid(iptr)) |
||||
{ |
||||
LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); |
||||
return NULL; |
||||
} |
||||
|
||||
/*
|
||||
* Check the TID we got in a previous iteration, if any, and save the |
||||
* current TID we got from the revmap; if we loop, we can sanity-check |
||||
* that the next one we get is different. Otherwise we might be stuck |
||||
* looping forever if the revmap is somehow badly broken. |
||||
*/ |
||||
if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr)) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_INDEX_CORRUPTED), |
||||
errmsg_internal("corrupted BRIN index: inconsistent range map"))); |
||||
previptr = *iptr; |
||||
|
||||
blk = ItemPointerGetBlockNumber(iptr); |
||||
*off = ItemPointerGetOffsetNumber(iptr); |
||||
|
||||
LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); |
||||
|
||||
/* Ok, got a pointer to where the BrinTuple should be. Fetch it. */ |
||||
if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk) |
||||
{ |
||||
if (BufferIsValid(*buf)) |
||||
ReleaseBuffer(*buf); |
||||
*buf = ReadBuffer(idxRel, blk); |
||||
} |
||||
LockBuffer(*buf, mode); |
||||
page = BufferGetPage(*buf); |
||||
|
||||
/* If we land on a revmap page, start over */ |
||||
if (BRIN_IS_REGULAR_PAGE(page)) |
||||
{ |
||||
lp = PageGetItemId(page, *off); |
||||
if (ItemIdIsUsed(lp)) |
||||
{ |
||||
tup = (BrinTuple *) PageGetItem(page, lp); |
||||
|
||||
if (tup->bt_blkno == heapBlk) |
||||
{ |
||||
if (size) |
||||
*size = ItemIdGetLength(lp); |
||||
/* found it! */ |
||||
return tup; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* No luck. Assume that the revmap was updated concurrently. |
||||
*/ |
||||
LockBuffer(*buf, BUFFER_LOCK_UNLOCK); |
||||
} |
||||
/* not reached, but keep compiler quiet */ |
||||
return NULL; |
||||
} |
||||
|
||||
/*
|
||||
* Given a heap block number, find the corresponding physical revmap block |
||||
* number and return it. If the revmap page hasn't been allocated yet, return |
||||
* InvalidBlockNumber. |
||||
*/ |
||||
static BlockNumber |
||||
revmap_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk) |
||||
{ |
||||
BlockNumber targetblk; |
||||
|
||||
/* obtain revmap block number, skip 1 for metapage block */ |
||||
targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1; |
||||
|
||||
/* Normal case: the revmap page is already allocated */ |
||||
if (targetblk <= revmap->rm_lastRevmapPage) |
||||
return targetblk; |
||||
|
||||
return InvalidBlockNumber; |
||||
} |
||||
|
||||
/*
|
||||
* Obtain and return a buffer containing the revmap page for the given heap |
||||
* page. The revmap must have been previously extended to cover that page. |
||||
* The returned buffer is also recorded in the revmap struct; finishing that |
||||
* releases the buffer, therefore the caller needn't do it explicitely. |
||||
*/ |
||||
static Buffer |
||||
revmap_get_buffer(BrinRevmap *revmap, BlockNumber heapBlk) |
||||
{ |
||||
BlockNumber mapBlk; |
||||
|
||||
/* Translate the heap block number to physical index location. */ |
||||
mapBlk = revmap_get_blkno(revmap, heapBlk); |
||||
|
||||
if (mapBlk == InvalidBlockNumber) |
||||
elog(ERROR, "revmap does not cover heap block %u", heapBlk); |
||||
|
||||
/* Ensure the buffer we got is in the expected range */ |
||||
Assert(mapBlk != BRIN_METAPAGE_BLKNO && |
||||
mapBlk <= revmap->rm_lastRevmapPage); |
||||
|
||||
BRIN_elog(DEBUG2, "getting revmap page for logical page %lu (physical %u) for heap %u", |
||||
HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk), |
||||
mapBlk, heapBlk); |
||||
|
||||
/*
|
||||
* Obtain the buffer from which we need to read. If we already have the |
||||
* correct buffer in our access struct, use that; otherwise, release that, |
||||
* (if valid) and read the one we need. |
||||
*/ |
||||
if (revmap->rm_currBuf == InvalidBuffer || |
||||
mapBlk != BufferGetBlockNumber(revmap->rm_currBuf)) |
||||
{ |
||||
if (revmap->rm_currBuf != InvalidBuffer) |
||||
ReleaseBuffer(revmap->rm_currBuf); |
||||
|
||||
revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk); |
||||
} |
||||
|
||||
return revmap->rm_currBuf; |
||||
} |
||||
|
||||
/*
|
||||
* Given a heap block number, find the corresponding physical revmap block |
||||
* number and return it. If the revmap page hasn't been allocated yet, extend |
||||
* the revmap until it is. |
||||
*/ |
||||
static BlockNumber |
||||
revmap_extend_and_get_blkno(BrinRevmap *revmap, BlockNumber heapBlk) |
||||
{ |
||||
BlockNumber targetblk; |
||||
|
||||
/* obtain revmap block number, skip 1 for metapage block */ |
||||
targetblk = HEAPBLK_TO_REVMAP_BLK(revmap->rm_pagesPerRange, heapBlk) + 1; |
||||
|
||||
/* Extend the revmap, if necessary */ |
||||
while (targetblk > revmap->rm_lastRevmapPage) |
||||
{ |
||||
CHECK_FOR_INTERRUPTS(); |
||||
revmap_physical_extend(revmap); |
||||
} |
||||
|
||||
return targetblk; |
||||
} |
||||
|
||||
/*
|
||||
* Try to extend the revmap by one page. This might not happen for a number of |
||||
* reasons; caller is expected to retry until the expected outcome is obtained. |
||||
*/ |
||||
static void |
||||
revmap_physical_extend(BrinRevmap *revmap) |
||||
{ |
||||
Buffer buf; |
||||
Page page; |
||||
Page metapage; |
||||
BrinMetaPageData *metadata; |
||||
BlockNumber mapBlk; |
||||
BlockNumber nblocks; |
||||
Relation irel = revmap->rm_irel; |
||||
bool needLock = !RELATION_IS_LOCAL(irel); |
||||
|
||||
/*
|
||||
* Lock the metapage. This locks out concurrent extensions of the revmap, |
||||
* but note that we still need to grab the relation extension lock because |
||||
* another backend can extend the index with regular BRIN pages. |
||||
*/ |
||||
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE); |
||||
metapage = BufferGetPage(revmap->rm_metaBuf); |
||||
metadata = (BrinMetaPageData *) PageGetContents(metapage); |
||||
|
||||
/*
|
||||
* Check that our cached lastRevmapPage value was up-to-date; if it |
||||
* wasn't, update the cached copy and have caller start over. |
||||
*/ |
||||
if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage) |
||||
{ |
||||
revmap->rm_lastRevmapPage = metadata->lastRevmapPage; |
||||
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); |
||||
return; |
||||
} |
||||
mapBlk = metadata->lastRevmapPage + 1; |
||||
|
||||
nblocks = RelationGetNumberOfBlocks(irel); |
||||
if (mapBlk < nblocks) |
||||
{ |
||||
buf = ReadBuffer(irel, mapBlk); |
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
||||
page = BufferGetPage(buf); |
||||
} |
||||
else |
||||
{ |
||||
if (needLock) |
||||
LockRelationForExtension(irel, ExclusiveLock); |
||||
|
||||
buf = ReadBuffer(irel, P_NEW); |
||||
if (BufferGetBlockNumber(buf) != mapBlk) |
||||
{ |
||||
/*
|
||||
* Very rare corner case: somebody extended the relation |
||||
* concurrently after we read its length. If this happens, give |
||||
* up and have caller start over. We will have to evacuate that |
||||
* page from under whoever is using it. |
||||
*/ |
||||
if (needLock) |
||||
UnlockRelationForExtension(irel, ExclusiveLock); |
||||
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); |
||||
return; |
||||
} |
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
||||
page = BufferGetPage(buf); |
||||
|
||||
if (needLock) |
||||
UnlockRelationForExtension(irel, ExclusiveLock); |
||||
} |
||||
|
||||
/* Check that it's a regular block (or an empty page) */ |
||||
if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page)) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_INDEX_CORRUPTED), |
||||
errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u", |
||||
BRIN_PAGE_TYPE(page), |
||||
RelationGetRelationName(irel), |
||||
BufferGetBlockNumber(buf)))); |
||||
|
||||
/* If the page is in use, evacuate it and restart */ |
||||
if (brin_start_evacuating_page(irel, buf)) |
||||
{ |
||||
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); |
||||
brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf); |
||||
|
||||
/* have caller start over */ |
||||
return; |
||||
} |
||||
|
||||
/*
|
||||
* Ok, we have now locked the metapage and the target block. Re-initialize |
||||
* it as a revmap page. |
||||
*/ |
||||
START_CRIT_SECTION(); |
||||
|
||||
/* the rm_tids array is initialized to all invalid by PageInit */ |
||||
brin_page_init(page, BRIN_PAGETYPE_REVMAP); |
||||
MarkBufferDirty(buf); |
||||
|
||||
metadata->lastRevmapPage = mapBlk; |
||||
MarkBufferDirty(revmap->rm_metaBuf); |
||||
|
||||
if (RelationNeedsWAL(revmap->rm_irel)) |
||||
{ |
||||
xl_brin_revmap_extend xlrec; |
||||
XLogRecPtr recptr; |
||||
XLogRecData rdata[2]; |
||||
|
||||
xlrec.node = revmap->rm_irel->rd_node; |
||||
xlrec.targetBlk = mapBlk; |
||||
rdata[0].data = (char *) &xlrec; |
||||
rdata[0].len = SizeOfBrinRevmapExtend; |
||||
rdata[0].buffer = InvalidBuffer; |
||||
rdata[0].buffer_std = false; |
||||
rdata[0].next = &(rdata[1]); |
||||
|
||||
rdata[1].data = (char *) NULL; |
||||
rdata[1].len = 0; |
||||
rdata[1].buffer = revmap->rm_metaBuf; |
||||
rdata[1].buffer_std = false; |
||||
rdata[1].next = NULL; |
||||
|
||||
recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata); |
||||
PageSetLSN(metapage, recptr); |
||||
PageSetLSN(page, recptr); |
||||
} |
||||
|
||||
END_CRIT_SECTION(); |
||||
|
||||
LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); |
||||
|
||||
UnlockReleaseBuffer(buf); |
||||
} |
@ -0,0 +1,554 @@ |
||||
/*
|
||||
* brin_tuple.c |
||||
* Method implementations for tuples in BRIN indexes. |
||||
* |
||||
* Intended usage is that code outside this file only deals with |
||||
* BrinMemTuples, and converts to and from the on-disk representation through |
||||
* functions in this file. |
||||
* |
||||
* NOTES |
||||
* |
||||
* A BRIN tuple is similar to a heap tuple, with a few key differences. The |
||||
* first interesting difference is that the tuple header is much simpler, only |
||||
* containing its total length and a small area for flags. Also, the stored |
||||
* data does not match the relation tuple descriptor exactly: for each |
||||
* attribute in the descriptor, the index tuple carries an arbitrary number |
||||
* of values, depending on the opclass. |
||||
* |
||||
* Also, for each column of the index relation there are two null bits: one |
||||
* (hasnulls) stores whether any tuple within the page range has that column |
||||
* set to null; the other one (allnulls) stores whether the column values are |
||||
* all null. If allnulls is true, then the tuple data area does not contain |
||||
* values for that column at all; whereas it does if the hasnulls is set. |
||||
* Note the size of the null bitmask may not be the same as that of the |
||||
* datum array. |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/access/brin/brin_tuple.c |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/htup_details.h" |
||||
#include "access/brin_tuple.h" |
||||
#include "access/tupdesc.h" |
||||
#include "access/tupmacs.h" |
||||
#include "utils/datum.h" |
||||
#include "utils/memutils.h" |
||||
|
||||
|
||||
static inline void brin_deconstruct_tuple(BrinDesc *brdesc, |
||||
char *tp, bits8 *nullbits, bool nulls, |
||||
Datum *values, bool *allnulls, bool *hasnulls); |
||||
|
||||
|
||||
/*
|
||||
* Return a tuple descriptor used for on-disk storage of BRIN tuples. |
||||
*/ |
||||
static TupleDesc |
||||
brtuple_disk_tupdesc(BrinDesc *brdesc) |
||||
{ |
||||
/* We cache these in the BrinDesc */ |
||||
if (brdesc->bd_disktdesc == NULL) |
||||
{ |
||||
int i; |
||||
int j; |
||||
AttrNumber attno = 1; |
||||
TupleDesc tupdesc; |
||||
MemoryContext oldcxt; |
||||
|
||||
/* make sure it's in the bdesc's context */ |
||||
oldcxt = MemoryContextSwitchTo(brdesc->bd_context); |
||||
|
||||
tupdesc = CreateTemplateTupleDesc(brdesc->bd_totalstored, false); |
||||
|
||||
for (i = 0; i < brdesc->bd_tupdesc->natts; i++) |
||||
{ |
||||
for (j = 0; j < brdesc->bd_info[i]->oi_nstored; j++) |
||||
TupleDescInitEntry(tupdesc, attno++, NULL, |
||||
brdesc->bd_info[i]->oi_typids[j], |
||||
-1, 0); |
||||
} |
||||
|
||||
MemoryContextSwitchTo(oldcxt); |
||||
|
||||
brdesc->bd_disktdesc = tupdesc; |
||||
} |
||||
|
||||
return brdesc->bd_disktdesc; |
||||
} |
||||
|
||||
/*
|
||||
* Generate a new on-disk tuple to be inserted in a BRIN index. |
||||
* |
||||
* See brin_form_placeholder_tuple if you touch this. |
||||
*/ |
||||
BrinTuple * |
||||
brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, |
||||
Size *size) |
||||
{ |
||||
Datum *values; |
||||
bool *nulls; |
||||
bool anynulls = false; |
||||
BrinTuple *rettuple; |
||||
int keyno; |
||||
int idxattno; |
||||
uint16 phony_infomask; |
||||
bits8 *phony_nullbitmap; |
||||
Size len, |
||||
hoff, |
||||
data_len; |
||||
|
||||
Assert(brdesc->bd_totalstored > 0); |
||||
|
||||
values = palloc(sizeof(Datum) * brdesc->bd_totalstored); |
||||
nulls = palloc0(sizeof(bool) * brdesc->bd_totalstored); |
||||
phony_nullbitmap = palloc(sizeof(bits8) * BITMAPLEN(brdesc->bd_totalstored)); |
||||
|
||||
/*
|
||||
* Set up the values/nulls arrays for heap_fill_tuple |
||||
*/ |
||||
idxattno = 0; |
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) |
||||
{ |
||||
int datumno; |
||||
|
||||
/*
|
||||
* "allnulls" is set when there's no nonnull value in any row in the |
||||
* column; when this happens, there is no data to store. Thus set the |
||||
* nullable bits for all data elements of this column and we're done. |
||||
*/ |
||||
if (tuple->bt_columns[keyno].bv_allnulls) |
||||
{ |
||||
for (datumno = 0; |
||||
datumno < brdesc->bd_info[keyno]->oi_nstored; |
||||
datumno++) |
||||
nulls[idxattno++] = true; |
||||
anynulls = true; |
||||
continue; |
||||
} |
||||
|
||||
/*
|
||||
* The "hasnulls" bit is set when there are some null values in the |
||||
* data. We still need to store a real value, but the presence of |
||||
* this means we need a null bitmap. |
||||
*/ |
||||
if (tuple->bt_columns[keyno].bv_hasnulls) |
||||
anynulls = true; |
||||
|
||||
for (datumno = 0; |
||||
datumno < brdesc->bd_info[keyno]->oi_nstored; |
||||
datumno++) |
||||
values[idxattno++] = tuple->bt_columns[keyno].bv_values[datumno]; |
||||
} |
||||
|
||||
/* compute total space needed */ |
||||
len = SizeOfBrinTuple; |
||||
if (anynulls) |
||||
{ |
||||
/*
|
||||
* We need a double-length bitmap on an on-disk BRIN index tuple; the |
||||
* first half stores the "allnulls" bits, the second stores |
||||
* "hasnulls". |
||||
*/ |
||||
len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); |
||||
} |
||||
|
||||
len = hoff = MAXALIGN(len); |
||||
|
||||
data_len = heap_compute_data_size(brtuple_disk_tupdesc(brdesc), |
||||
values, nulls); |
||||
|
||||
len += data_len; |
||||
|
||||
rettuple = palloc0(len); |
||||
rettuple->bt_blkno = blkno; |
||||
rettuple->bt_info = hoff; |
||||
Assert((rettuple->bt_info & BRIN_OFFSET_MASK) == hoff); |
||||
|
||||
/*
|
||||
* The infomask and null bitmap as computed by heap_fill_tuple are useless |
||||
* to us. However, that function will not accept a null infomask; and we |
||||
* need to pass a valid null bitmap so that it will correctly skip |
||||
* outputting null attributes in the data area. |
||||
*/ |
||||
heap_fill_tuple(brtuple_disk_tupdesc(brdesc), |
||||
values, |
||||
nulls, |
||||
(char *) rettuple + hoff, |
||||
data_len, |
||||
&phony_infomask, |
||||
phony_nullbitmap); |
||||
|
||||
/* done with these */ |
||||
pfree(values); |
||||
pfree(nulls); |
||||
pfree(phony_nullbitmap); |
||||
|
||||
/*
|
||||
* Now fill in the real null bitmasks. allnulls first. |
||||
*/ |
||||
if (anynulls) |
||||
{ |
||||
bits8 *bitP; |
||||
int bitmask; |
||||
|
||||
rettuple->bt_info |= BRIN_NULLS_MASK; |
||||
|
||||
/*
|
||||
* Note that we reverse the sense of null bits in this module: we |
||||
* store a 1 for a null attribute rather than a 0. So we must reverse |
||||
* the sense of the att_isnull test in br_deconstruct_tuple as well. |
||||
*/ |
||||
bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; |
||||
bitmask = HIGHBIT; |
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) |
||||
{ |
||||
if (bitmask != HIGHBIT) |
||||
bitmask <<= 1; |
||||
else |
||||
{ |
||||
bitP += 1; |
||||
*bitP = 0x0; |
||||
bitmask = 1; |
||||
} |
||||
|
||||
if (!tuple->bt_columns[keyno].bv_allnulls) |
||||
continue; |
||||
|
||||
*bitP |= bitmask; |
||||
} |
||||
/* hasnulls bits follow */ |
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) |
||||
{ |
||||
if (bitmask != HIGHBIT) |
||||
bitmask <<= 1; |
||||
else |
||||
{ |
||||
bitP += 1; |
||||
*bitP = 0x0; |
||||
bitmask = 1; |
||||
} |
||||
|
||||
if (!tuple->bt_columns[keyno].bv_hasnulls) |
||||
continue; |
||||
|
||||
*bitP |= bitmask; |
||||
} |
||||
bitP = ((bits8 *) (rettuple + SizeOfBrinTuple)) - 1; |
||||
} |
||||
|
||||
if (tuple->bt_placeholder) |
||||
rettuple->bt_info |= BRIN_PLACEHOLDER_MASK; |
||||
|
||||
*size = len; |
||||
return rettuple; |
||||
} |
||||
|
||||
/*
|
||||
* Generate a new on-disk tuple with no data values, marked as placeholder. |
||||
* |
||||
* This is a cut-down version of brin_form_tuple. |
||||
*/ |
||||
BrinTuple * |
||||
brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size) |
||||
{ |
||||
Size len; |
||||
Size hoff; |
||||
BrinTuple *rettuple; |
||||
int keyno; |
||||
bits8 *bitP; |
||||
int bitmask; |
||||
|
||||
/* compute total space needed: always add nulls */ |
||||
len = SizeOfBrinTuple; |
||||
len += BITMAPLEN(brdesc->bd_tupdesc->natts * 2); |
||||
len = hoff = MAXALIGN(len); |
||||
|
||||
rettuple = palloc0(len); |
||||
rettuple->bt_blkno = blkno; |
||||
rettuple->bt_info = hoff; |
||||
rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK; |
||||
|
||||
bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1; |
||||
bitmask = HIGHBIT; |
||||
/* set allnulls true for all attributes */ |
||||
for (keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) |
||||
{ |
||||
if (bitmask != HIGHBIT) |
||||
bitmask <<= 1; |
||||
else |
||||
{ |
||||
bitP += 1; |
||||
*bitP = 0x0; |
||||
bitmask = 1; |
||||
} |
||||
|
||||
*bitP |= bitmask; |
||||
} |
||||
/* no need to set hasnulls */ |
||||
|
||||
*size = len; |
||||
return rettuple; |
||||
} |
||||
|
||||
/*
|
||||
* Free a tuple created by brin_form_tuple |
||||
*/ |
||||
void |
||||
brin_free_tuple(BrinTuple *tuple) |
||||
{ |
||||
pfree(tuple); |
||||
} |
||||
|
||||
/*
|
||||
* Create an palloc'd copy of a BrinTuple. |
||||
*/ |
||||
BrinTuple * |
||||
brin_copy_tuple(BrinTuple *tuple, Size len) |
||||
{ |
||||
BrinTuple *newtup; |
||||
|
||||
newtup = palloc(len); |
||||
memcpy(newtup, tuple, len); |
||||
|
||||
return newtup; |
||||
} |
||||
|
||||
/*
|
||||
* Return whether two BrinTuples are bitwise identical. |
||||
*/ |
||||
bool |
||||
brin_tuples_equal(const BrinTuple *a, Size alen, const BrinTuple *b, Size blen) |
||||
{ |
||||
if (alen != blen) |
||||
return false; |
||||
if (memcmp(a, b, alen) != 0) |
||||
return false; |
||||
return true; |
||||
} |
||||
|
||||
/*
|
||||
* Create a new BrinMemTuple from scratch, and initialize it to an empty |
||||
* state. |
||||
* |
||||
* Note: we don't provide any means to free a deformed tuple, so make sure to |
||||
* use a temporary memory context. |
||||
*/ |
||||
BrinMemTuple * |
||||
brin_new_memtuple(BrinDesc *brdesc) |
||||
{ |
||||
BrinMemTuple *dtup; |
||||
char *currdatum; |
||||
long basesize; |
||||
int i; |
||||
|
||||
basesize = MAXALIGN(sizeof(BrinMemTuple) + |
||||
sizeof(BrinValues) * brdesc->bd_tupdesc->natts); |
||||
dtup = palloc0(basesize + sizeof(Datum) * brdesc->bd_totalstored); |
||||
currdatum = (char *) dtup + basesize; |
||||
for (i = 0; i < brdesc->bd_tupdesc->natts; i++) |
||||
{ |
||||
dtup->bt_columns[i].bv_attno = i + 1; |
||||
dtup->bt_columns[i].bv_allnulls = true; |
||||
dtup->bt_columns[i].bv_hasnulls = false; |
||||
dtup->bt_columns[i].bv_values = (Datum *) currdatum; |
||||
currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored; |
||||
} |
||||
|
||||
dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext, |
||||
"brin dtuple", |
||||
ALLOCSET_DEFAULT_MINSIZE, |
||||
ALLOCSET_DEFAULT_INITSIZE, |
||||
ALLOCSET_DEFAULT_MAXSIZE); |
||||
return dtup; |
||||
} |
||||
|
||||
/*
|
||||
* Reset a BrinMemTuple to initial state |
||||
*/ |
||||
void |
||||
brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc) |
||||
{ |
||||
int i; |
||||
|
||||
MemoryContextReset(dtuple->bt_context); |
||||
for (i = 0; i < brdesc->bd_tupdesc->natts; i++) |
||||
{ |
||||
dtuple->bt_columns[i].bv_allnulls = true; |
||||
dtuple->bt_columns[i].bv_hasnulls = false; |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* Convert a BrinTuple back to a BrinMemTuple. This is the reverse of |
||||
* brin_form_tuple. |
||||
* |
||||
* Note we don't need the "on disk tupdesc" here; we rely on our own routine to |
||||
* deconstruct the tuple from the on-disk format. |
||||
*/ |
||||
BrinMemTuple * |
||||
brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple) |
||||
{ |
||||
BrinMemTuple *dtup; |
||||
Datum *values; |
||||
bool *allnulls; |
||||
bool *hasnulls; |
||||
char *tp; |
||||
bits8 *nullbits; |
||||
int keyno; |
||||
int valueno; |
||||
MemoryContext oldcxt; |
||||
|
||||
dtup = brin_new_memtuple(brdesc); |
||||
|
||||
if (BrinTupleIsPlaceholder(tuple)) |
||||
dtup->bt_placeholder = true; |
||||
dtup->bt_blkno = tuple->bt_blkno; |
||||
|
||||
values = palloc(sizeof(Datum) * brdesc->bd_totalstored); |
||||
allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); |
||||
hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts); |
||||
|
||||
tp = (char *) tuple + BrinTupleDataOffset(tuple); |
||||
|
||||
if (BrinTupleHasNulls(tuple)) |
||||
nullbits = (bits8 *) ((char *) tuple + SizeOfBrinTuple); |
||||
else |
||||
nullbits = NULL; |
||||
brin_deconstruct_tuple(brdesc, |
||||
tp, nullbits, BrinTupleHasNulls(tuple), |
||||
values, allnulls, hasnulls); |
||||
|
||||
/*
|
||||
* Iterate to assign each of the values to the corresponding item in the |
||||
* values array of each column. The copies occur in the tuple's context. |
||||
*/ |
||||
oldcxt = MemoryContextSwitchTo(dtup->bt_context); |
||||
for (valueno = 0, keyno = 0; keyno < brdesc->bd_tupdesc->natts; keyno++) |
||||
{ |
||||
int i; |
||||
|
||||
if (allnulls[keyno]) |
||||
{ |
||||
valueno += brdesc->bd_info[keyno]->oi_nstored; |
||||
continue; |
||||
} |
||||
|
||||
/*
|
||||
* We would like to skip datumCopy'ing the values datum in some cases, |
||||
* caller permitting ... |
||||
*/ |
||||
for (i = 0; i < brdesc->bd_info[keyno]->oi_nstored; i++) |
||||
dtup->bt_columns[keyno].bv_values[i] = |
||||
datumCopy(values[valueno++], |
||||
brdesc->bd_tupdesc->attrs[keyno]->attbyval, |
||||
brdesc->bd_tupdesc->attrs[keyno]->attlen); |
||||
|
||||
dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno]; |
||||
dtup->bt_columns[keyno].bv_allnulls = false; |
||||
} |
||||
|
||||
MemoryContextSwitchTo(oldcxt); |
||||
|
||||
pfree(values); |
||||
pfree(allnulls); |
||||
pfree(hasnulls); |
||||
|
||||
return dtup; |
||||
} |
||||
|
||||
/*
|
||||
* brin_deconstruct_tuple |
||||
* Guts of attribute extraction from an on-disk BRIN tuple. |
||||
* |
||||
* Its arguments are: |
||||
* brdesc BRIN descriptor for the stored tuple |
||||
* tp pointer to the tuple data area |
||||
* nullbits pointer to the tuple nulls bitmask |
||||
* nulls "has nulls" bit in tuple infomask |
||||
* values output values, array of size brdesc->bd_totalstored |
||||
* allnulls output "allnulls", size brdesc->bd_tupdesc->natts |
||||
* hasnulls output "hasnulls", size brdesc->bd_tupdesc->natts |
||||
* |
||||
* Output arrays must have been allocated by caller. |
||||
*/ |
||||
static inline void |
||||
brin_deconstruct_tuple(BrinDesc *brdesc, |
||||
char *tp, bits8 *nullbits, bool nulls, |
||||
Datum *values, bool *allnulls, bool *hasnulls) |
||||
{ |
||||
int attnum; |
||||
int stored; |
||||
TupleDesc diskdsc; |
||||
long off; |
||||
|
||||
/*
|
||||
* First iterate to natts to obtain both null flags for each attribute. |
||||
* Note that we reverse the sense of the att_isnull test, because we store |
||||
* 1 for a null value (rather than a 1 for a not null value as is the |
||||
* att_isnull convention used elsewhere.) See brin_form_tuple. |
||||
*/ |
||||
for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) |
||||
{ |
||||
/*
|
||||
* the "all nulls" bit means that all values in the page range for |
||||
* this column are nulls. Therefore there are no values in the tuple |
||||
* data area. |
||||
*/ |
||||
allnulls[attnum] = nulls && !att_isnull(attnum, nullbits); |
||||
|
||||
/*
|
||||
* the "has nulls" bit means that some tuples have nulls, but others |
||||
* have not-null values. Therefore we know the tuple contains data |
||||
* for this column. |
||||
* |
||||
* The hasnulls bits follow the allnulls bits in the same bitmask. |
||||
*/ |
||||
hasnulls[attnum] = |
||||
nulls && !att_isnull(brdesc->bd_tupdesc->natts + attnum, nullbits); |
||||
} |
||||
|
||||
/*
|
||||
* Iterate to obtain each attribute's stored values. Note that since we |
||||
* may reuse attribute entries for more than one column, we cannot cache |
||||
* offsets here. |
||||
*/ |
||||
diskdsc = brtuple_disk_tupdesc(brdesc); |
||||
stored = 0; |
||||
off = 0; |
||||
for (attnum = 0; attnum < brdesc->bd_tupdesc->natts; attnum++) |
||||
{ |
||||
int datumno; |
||||
|
||||
if (allnulls[attnum]) |
||||
{ |
||||
stored += brdesc->bd_info[attnum]->oi_nstored; |
||||
continue; |
||||
} |
||||
|
||||
for (datumno = 0; |
||||
datumno < brdesc->bd_info[attnum]->oi_nstored; |
||||
datumno++) |
||||
{ |
||||
Form_pg_attribute thisatt = diskdsc->attrs[stored]; |
||||
|
||||
if (thisatt->attlen == -1) |
||||
{ |
||||
off = att_align_pointer(off, thisatt->attalign, -1, |
||||
tp + off); |
||||
} |
||||
else |
||||
{ |
||||
/* not varlena, so safe to use att_align_nominal */ |
||||
off = att_align_nominal(off, thisatt->attalign); |
||||
} |
||||
|
||||
values[stored++] = fetchatt(thisatt, tp + off); |
||||
|
||||
off = att_addlength_pointer(off, thisatt->attlen, tp + off); |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,291 @@ |
||||
/*
|
||||
* brin_xlog.c |
||||
* XLog replay routines for BRIN indexes |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/access/brin/brin_xlog.c |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/brin_page.h" |
||||
#include "access/brin_pageops.h" |
||||
#include "access/brin_xlog.h" |
||||
#include "access/xlogutils.h" |
||||
|
||||
|
||||
/*
|
||||
* xlog replay routines |
||||
*/ |
||||
static void |
||||
brin_xlog_createidx(XLogRecPtr lsn, XLogRecord *record) |
||||
{ |
||||
xl_brin_createidx *xlrec = (xl_brin_createidx *) XLogRecGetData(record); |
||||
Buffer buf; |
||||
Page page; |
||||
|
||||
/* Backup blocks are not used in create_index records */ |
||||
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); |
||||
|
||||
/* create the index' metapage */ |
||||
buf = XLogReadBuffer(xlrec->node, BRIN_METAPAGE_BLKNO, true); |
||||
Assert(BufferIsValid(buf)); |
||||
page = (Page) BufferGetPage(buf); |
||||
brin_metapage_init(page, xlrec->pagesPerRange, xlrec->version); |
||||
PageSetLSN(page, lsn); |
||||
MarkBufferDirty(buf); |
||||
UnlockReleaseBuffer(buf); |
||||
} |
||||
|
||||
/*
|
||||
* Common part of an insert or update. Inserts the new tuple and updates the |
||||
* revmap. |
||||
*/ |
||||
static void |
||||
brin_xlog_insert_update(XLogRecPtr lsn, XLogRecord *record, |
||||
xl_brin_insert *xlrec, BrinTuple *tuple) |
||||
{ |
||||
BlockNumber blkno; |
||||
Buffer buffer; |
||||
Page page; |
||||
XLogRedoAction action; |
||||
|
||||
blkno = ItemPointerGetBlockNumber(&xlrec->tid); |
||||
|
||||
/*
|
||||
* If we inserted the first and only tuple on the page, re-initialize the |
||||
* page from scratch. |
||||
*/ |
||||
if (record->xl_info & XLOG_BRIN_INIT_PAGE) |
||||
{ |
||||
XLogReadBufferForRedoExtended(lsn, record, 0, |
||||
xlrec->node, MAIN_FORKNUM, blkno, |
||||
RBM_ZERO, false, &buffer); |
||||
page = BufferGetPage(buffer); |
||||
brin_page_init(page, BRIN_PAGETYPE_REGULAR); |
||||
action = BLK_NEEDS_REDO; |
||||
} |
||||
else |
||||
{ |
||||
action = XLogReadBufferForRedo(lsn, record, 0, |
||||
xlrec->node, blkno, &buffer); |
||||
} |
||||
|
||||
/* insert the index item into the page */ |
||||
if (action == BLK_NEEDS_REDO) |
||||
{ |
||||
OffsetNumber offnum; |
||||
|
||||
Assert(tuple->bt_blkno == xlrec->heapBlk); |
||||
|
||||
page = (Page) BufferGetPage(buffer); |
||||
offnum = ItemPointerGetOffsetNumber(&(xlrec->tid)); |
||||
if (PageGetMaxOffsetNumber(page) + 1 < offnum) |
||||
elog(PANIC, "brin_xlog_insert_update: invalid max offset number"); |
||||
|
||||
offnum = PageAddItem(page, (Item) tuple, xlrec->tuplen, offnum, true, |
||||
false); |
||||
if (offnum == InvalidOffsetNumber) |
||||
elog(PANIC, "brin_xlog_insert_update: failed to add tuple"); |
||||
|
||||
PageSetLSN(page, lsn); |
||||
MarkBufferDirty(buffer); |
||||
} |
||||
if (BufferIsValid(buffer)) |
||||
UnlockReleaseBuffer(buffer); |
||||
|
||||
/* update the revmap */ |
||||
action = XLogReadBufferForRedo(lsn, record, 1, xlrec->node, |
||||
xlrec->revmapBlk, &buffer); |
||||
if (action == BLK_NEEDS_REDO) |
||||
{ |
||||
page = (Page) BufferGetPage(buffer); |
||||
|
||||
brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, |
||||
xlrec->tid); |
||||
PageSetLSN(page, lsn); |
||||
MarkBufferDirty(buffer); |
||||
} |
||||
if (BufferIsValid(buffer)) |
||||
UnlockReleaseBuffer(buffer); |
||||
|
||||
/* XXX no FSM updates here ... */ |
||||
} |
||||
|
||||
/*
|
||||
* replay a BRIN index insertion |
||||
*/ |
||||
static void |
||||
brin_xlog_insert(XLogRecPtr lsn, XLogRecord *record) |
||||
{ |
||||
xl_brin_insert *xlrec = (xl_brin_insert *) XLogRecGetData(record); |
||||
BrinTuple *newtup; |
||||
|
||||
newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinInsert); |
||||
|
||||
brin_xlog_insert_update(lsn, record, xlrec, newtup); |
||||
} |
||||
|
||||
/*
|
||||
* replay a BRIN index update |
||||
*/ |
||||
static void |
||||
brin_xlog_update(XLogRecPtr lsn, XLogRecord *record) |
||||
{ |
||||
xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record); |
||||
BlockNumber blkno; |
||||
Buffer buffer; |
||||
BrinTuple *newtup; |
||||
XLogRedoAction action; |
||||
|
||||
newtup = (BrinTuple *) ((char *) xlrec + SizeOfBrinUpdate); |
||||
|
||||
/* First remove the old tuple */ |
||||
blkno = ItemPointerGetBlockNumber(&(xlrec->oldtid)); |
||||
action = XLogReadBufferForRedo(lsn, record, 2, xlrec->new.node, |
||||
blkno, &buffer); |
||||
if (action == BLK_NEEDS_REDO) |
||||
{ |
||||
Page page; |
||||
OffsetNumber offnum; |
||||
|
||||
page = (Page) BufferGetPage(buffer); |
||||
|
||||
offnum = ItemPointerGetOffsetNumber(&(xlrec->oldtid)); |
||||
if (PageGetMaxOffsetNumber(page) + 1 < offnum) |
||||
elog(PANIC, "brin_xlog_update: invalid max offset number"); |
||||
|
||||
PageIndexDeleteNoCompact(page, &offnum, 1); |
||||
|
||||
PageSetLSN(page, lsn); |
||||
MarkBufferDirty(buffer); |
||||
} |
||||
|
||||
/* Then insert the new tuple and update revmap, like in an insertion. */ |
||||
brin_xlog_insert_update(lsn, record, &xlrec->new, newtup); |
||||
|
||||
if (BufferIsValid(buffer)) |
||||
UnlockReleaseBuffer(buffer); |
||||
} |
||||
|
||||
/*
|
||||
* Update a tuple on a single page. |
||||
*/ |
||||
static void |
||||
brin_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record) |
||||
{ |
||||
xl_brin_samepage_update *xlrec; |
||||
BlockNumber blkno; |
||||
Buffer buffer; |
||||
XLogRedoAction action; |
||||
|
||||
xlrec = (xl_brin_samepage_update *) XLogRecGetData(record); |
||||
blkno = ItemPointerGetBlockNumber(&(xlrec->tid)); |
||||
action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, blkno, |
||||
&buffer); |
||||
if (action == BLK_NEEDS_REDO) |
||||
{ |
||||
int tuplen; |
||||
BrinTuple *mmtuple; |
||||
Page page; |
||||
OffsetNumber offnum; |
||||
|
||||
tuplen = record->xl_len - SizeOfBrinSamepageUpdate; |
||||
mmtuple = (BrinTuple *) ((char *) xlrec + SizeOfBrinSamepageUpdate); |
||||
|
||||
page = (Page) BufferGetPage(buffer); |
||||
|
||||
offnum = ItemPointerGetOffsetNumber(&(xlrec->tid)); |
||||
if (PageGetMaxOffsetNumber(page) + 1 < offnum) |
||||
elog(PANIC, "brin_xlog_samepage_update: invalid max offset number"); |
||||
|
||||
PageIndexDeleteNoCompact(page, &offnum, 1); |
||||
offnum = PageAddItem(page, (Item) mmtuple, tuplen, offnum, true, false); |
||||
if (offnum == InvalidOffsetNumber) |
||||
elog(PANIC, "brin_xlog_samepage_update: failed to add tuple"); |
||||
|
||||
PageSetLSN(page, lsn); |
||||
MarkBufferDirty(buffer); |
||||
} |
||||
if (BufferIsValid(buffer)) |
||||
UnlockReleaseBuffer(buffer); |
||||
|
||||
/* XXX no FSM updates here ... */ |
||||
} |
||||
|
||||
/*
|
||||
* Replay a revmap page extension |
||||
*/ |
||||
static void |
||||
brin_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record) |
||||
{ |
||||
xl_brin_revmap_extend *xlrec; |
||||
Buffer metabuf; |
||||
Buffer buf; |
||||
Page page; |
||||
XLogRedoAction action; |
||||
|
||||
xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record); |
||||
/* Update the metapage */ |
||||
action = XLogReadBufferForRedo(lsn, record, 0, xlrec->node, |
||||
BRIN_METAPAGE_BLKNO, &metabuf); |
||||
if (action == BLK_NEEDS_REDO) |
||||
{ |
||||
Page metapg; |
||||
BrinMetaPageData *metadata; |
||||
|
||||
metapg = BufferGetPage(metabuf); |
||||
metadata = (BrinMetaPageData *) PageGetContents(metapg); |
||||
|
||||
Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1); |
||||
metadata->lastRevmapPage = xlrec->targetBlk; |
||||
|
||||
PageSetLSN(metapg, lsn); |
||||
MarkBufferDirty(metabuf); |
||||
} |
||||
|
||||
/*
|
||||
* Re-init the target block as a revmap page. There's never a full- page |
||||
* image here. |
||||
*/ |
||||
|
||||
buf = XLogReadBuffer(xlrec->node, xlrec->targetBlk, true); |
||||
page = (Page) BufferGetPage(buf); |
||||
brin_page_init(page, BRIN_PAGETYPE_REVMAP); |
||||
|
||||
PageSetLSN(page, lsn); |
||||
MarkBufferDirty(buf); |
||||
|
||||
UnlockReleaseBuffer(buf); |
||||
if (BufferIsValid(metabuf)) |
||||
UnlockReleaseBuffer(metabuf); |
||||
} |
||||
|
||||
void |
||||
brin_redo(XLogRecPtr lsn, XLogRecord *record) |
||||
{ |
||||
uint8 info = record->xl_info & ~XLR_INFO_MASK; |
||||
|
||||
switch (info & XLOG_BRIN_OPMASK) |
||||
{ |
||||
case XLOG_BRIN_CREATE_INDEX: |
||||
brin_xlog_createidx(lsn, record); |
||||
break; |
||||
case XLOG_BRIN_INSERT: |
||||
brin_xlog_insert(lsn, record); |
||||
break; |
||||
case XLOG_BRIN_UPDATE: |
||||
brin_xlog_update(lsn, record); |
||||
break; |
||||
case XLOG_BRIN_SAMEPAGE_UPDATE: |
||||
brin_xlog_samepage_update(lsn, record); |
||||
break; |
||||
case XLOG_BRIN_REVMAP_EXTEND: |
||||
brin_xlog_revmap_extend(lsn, record); |
||||
break; |
||||
default: |
||||
elog(PANIC, "brin_redo: unknown op code %u", info); |
||||
} |
||||
} |
@ -0,0 +1,112 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* brindesc.c |
||||
* rmgr descriptor routines for BRIN indexes |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/access/rmgrdesc/brindesc.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/brin_xlog.h" |
||||
|
||||
void |
||||
brin_desc(StringInfo buf, XLogRecord *record) |
||||
{ |
||||
char *rec = XLogRecGetData(record); |
||||
uint8 info = record->xl_info & ~XLR_INFO_MASK; |
||||
|
||||
info &= XLOG_BRIN_OPMASK; |
||||
if (info == XLOG_BRIN_CREATE_INDEX) |
||||
{ |
||||
xl_brin_createidx *xlrec = (xl_brin_createidx *) rec; |
||||
|
||||
appendStringInfo(buf, "v%d pagesPerRange %u rel %u/%u/%u", |
||||
xlrec->version, xlrec->pagesPerRange, |
||||
xlrec->node.spcNode, xlrec->node.dbNode, |
||||
xlrec->node.relNode); |
||||
} |
||||
else if (info == XLOG_BRIN_INSERT) |
||||
{ |
||||
xl_brin_insert *xlrec = (xl_brin_insert *) rec; |
||||
|
||||
appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u TID (%u,%u)", |
||||
xlrec->node.spcNode, xlrec->node.dbNode, |
||||
xlrec->node.relNode, |
||||
xlrec->heapBlk, xlrec->revmapBlk, |
||||
xlrec->pagesPerRange, |
||||
ItemPointerGetBlockNumber(&xlrec->tid), |
||||
ItemPointerGetOffsetNumber(&xlrec->tid)); |
||||
} |
||||
else if (info == XLOG_BRIN_UPDATE) |
||||
{ |
||||
xl_brin_update *xlrec = (xl_brin_update *) rec; |
||||
|
||||
appendStringInfo(buf, "rel %u/%u/%u heapBlk %u revmapBlk %u pagesPerRange %u old TID (%u,%u) TID (%u,%u)", |
||||
xlrec->new.node.spcNode, xlrec->new.node.dbNode, |
||||
xlrec->new.node.relNode, |
||||
xlrec->new.heapBlk, xlrec->new.revmapBlk, |
||||
xlrec->new.pagesPerRange, |
||||
ItemPointerGetBlockNumber(&xlrec->oldtid), |
||||
ItemPointerGetOffsetNumber(&xlrec->oldtid), |
||||
ItemPointerGetBlockNumber(&xlrec->new.tid), |
||||
ItemPointerGetOffsetNumber(&xlrec->new.tid)); |
||||
} |
||||
else if (info == XLOG_BRIN_SAMEPAGE_UPDATE) |
||||
{ |
||||
xl_brin_samepage_update *xlrec = (xl_brin_samepage_update *) rec; |
||||
|
||||
appendStringInfo(buf, "rel %u/%u/%u TID (%u,%u)", |
||||
xlrec->node.spcNode, xlrec->node.dbNode, |
||||
xlrec->node.relNode, |
||||
ItemPointerGetBlockNumber(&xlrec->tid), |
||||
ItemPointerGetOffsetNumber(&xlrec->tid)); |
||||
} |
||||
else if (info == XLOG_BRIN_REVMAP_EXTEND) |
||||
{ |
||||
xl_brin_revmap_extend *xlrec = (xl_brin_revmap_extend *) rec; |
||||
|
||||
appendStringInfo(buf, "rel %u/%u/%u targetBlk %u", |
||||
xlrec->node.spcNode, xlrec->node.dbNode, |
||||
xlrec->node.relNode, xlrec->targetBlk); |
||||
} |
||||
} |
||||
|
||||
const char * |
||||
brin_identify(uint8 info) |
||||
{ |
||||
const char *id = NULL; |
||||
|
||||
switch (info & ~XLR_INFO_MASK) |
||||
{ |
||||
case XLOG_BRIN_CREATE_INDEX: |
||||
id = "CREATE_INDEX"; |
||||
break; |
||||
case XLOG_BRIN_INSERT: |
||||
id = "INSERT"; |
||||
break; |
||||
case XLOG_BRIN_INSERT | XLOG_BRIN_INIT_PAGE: |
||||
id = "INSERT+INIT"; |
||||
break; |
||||
case XLOG_BRIN_UPDATE: |
||||
id = "UPDATE"; |
||||
break; |
||||
case XLOG_BRIN_UPDATE | XLOG_BRIN_INIT_PAGE: |
||||
id = "UPDATE+INIT"; |
||||
break; |
||||
case XLOG_BRIN_SAMEPAGE_UPDATE: |
||||
id = "SAMEPAGE_UPDATE"; |
||||
break; |
||||
case XLOG_BRIN_REVMAP_EXTEND: |
||||
id = "REVMAP_EXTEND"; |
||||
break; |
||||
} |
||||
|
||||
return id; |
||||
} |
@ -0,0 +1,52 @@ |
||||
/*
|
||||
* AM-callable functions for BRIN indexes |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/include/access/brin.h |
||||
*/ |
||||
#ifndef BRIN_H |
||||
#define BRIN_H |
||||
|
||||
#include "fmgr.h" |
||||
#include "nodes/execnodes.h" |
||||
#include "utils/relcache.h" |
||||
|
||||
|
||||
/*
|
||||
* prototypes for functions in brin.c (external entry points for BRIN) |
||||
*/ |
||||
extern Datum brinbuild(PG_FUNCTION_ARGS); |
||||
extern Datum brinbuildempty(PG_FUNCTION_ARGS); |
||||
extern Datum brininsert(PG_FUNCTION_ARGS); |
||||
extern Datum brinbeginscan(PG_FUNCTION_ARGS); |
||||
extern Datum bringettuple(PG_FUNCTION_ARGS); |
||||
extern Datum bringetbitmap(PG_FUNCTION_ARGS); |
||||
extern Datum brinrescan(PG_FUNCTION_ARGS); |
||||
extern Datum brinendscan(PG_FUNCTION_ARGS); |
||||
extern Datum brinmarkpos(PG_FUNCTION_ARGS); |
||||
extern Datum brinrestrpos(PG_FUNCTION_ARGS); |
||||
extern Datum brinbulkdelete(PG_FUNCTION_ARGS); |
||||
extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS); |
||||
extern Datum brincanreturn(PG_FUNCTION_ARGS); |
||||
extern Datum brincostestimate(PG_FUNCTION_ARGS); |
||||
extern Datum brinoptions(PG_FUNCTION_ARGS); |
||||
|
||||
/*
|
||||
* Storage type for BRIN's reloptions |
||||
*/ |
||||
typedef struct BrinOptions |
||||
{ |
||||
int32 vl_len_; /* varlena header (do not touch directly!) */ |
||||
BlockNumber pagesPerRange; |
||||
} BrinOptions; |
||||
|
||||
/* pages per range used when a relation carries no BRIN reloptions */
#define BRIN_DEFAULT_PAGES_PER_RANGE	128

/*
 * Pages-per-range setting of an index relation: taken from its rd_options
 * when those are set, otherwise the default above.
 */
#define BrinGetPagesPerRange(relation) \
	((relation)->rd_options ? \
	 ((BrinOptions *) (relation)->rd_options)->pagesPerRange : \
	 BRIN_DEFAULT_PAGES_PER_RANGE)
||||
|
||||
#endif /* BRIN_H */ |
@ -0,0 +1,88 @@ |
||||
/*
|
||||
* brin_internal.h |
||||
* internal declarations for BRIN indexes |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/include/access/brin_internal.h |
||||
*/ |
||||
#ifndef BRIN_INTERNAL_H |
||||
#define BRIN_INTERNAL_H |
||||
|
||||
#include "fmgr.h" |
||||
#include "storage/buf.h" |
||||
#include "storage/bufpage.h" |
||||
#include "storage/off.h" |
||||
#include "utils/relcache.h" |
||||
|
||||
|
||||
/*
|
||||
* A BrinDesc is a struct designed to enable decoding a BRIN tuple from the |
||||
* on-disk format to an in-memory tuple and vice-versa. |
||||
*/ |
||||
|
||||
/* struct returned by "OpcInfo" amproc */ |
||||
typedef struct BrinOpcInfo |
||||
{ |
||||
/* Number of columns stored in an index column of this opclass */ |
||||
uint16 oi_nstored; |
||||
|
||||
/* Opaque pointer for the opclass' private use */ |
||||
void *oi_opaque; |
||||
|
||||
/* Type IDs of the stored columns */ |
||||
Oid oi_typids[FLEXIBLE_ARRAY_MEMBER]; |
||||
} BrinOpcInfo; |
||||
|
||||
/*
 * The size in bytes of a BrinOpcInfo holding the given number of stored
 * columns.  The argument is parenthesized so that expressions such as
 * SizeofBrinOpcInfo(a + b) are sized correctly.
 */
#define SizeofBrinOpcInfo(ncols) \
	(offsetof(BrinOpcInfo, oi_typids) + sizeof(Oid) * (ncols))
||||
|
||||
typedef struct BrinDesc |
||||
{ |
||||
/* Containing memory context */ |
||||
MemoryContext bd_context; |
||||
|
||||
/* the index relation itself */ |
||||
Relation bd_index; |
||||
|
||||
/* tuple descriptor of the index relation */ |
||||
TupleDesc bd_tupdesc; |
||||
|
||||
/* cached copy for on-disk tuples; generated at first use */ |
||||
TupleDesc bd_disktdesc; |
||||
|
||||
/* total number of Datum entries that are stored on-disk for all columns */ |
||||
int bd_totalstored; |
||||
|
||||
/* per-column info; bd_tupdesc->natts entries long */ |
||||
BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER]; |
||||
} BrinDesc; |
||||
|
||||
/*
 * Support function numbers known to the BRIN core.  Opclasses define their
 * own support function numbers as well; those must not collide with the
 * ones listed here.  Procedure numbers up to 10 are reserved for BRIN future
 * expansion.
 */
#define BRIN_PROCNUM_OPCINFO		1
#define BRIN_PROCNUM_ADDVALUE		2
#define BRIN_PROCNUM_CONSISTENT		3
#define BRIN_PROCNUM_UNION			4
||||
|
||||
/*
 * Debug logging for BRIN.  BRIN_elog() takes the same arguments as elog().
 * It forwards to elog() when BRIN_DEBUG is defined and the compiler is
 * GCC-compatible, and expands to a no-op otherwise.
 */
#define BRIN_DEBUG

/* we allow debug if using GCC; otherwise don't bother */
#if defined(BRIN_DEBUG) && defined(__GNUC__)
#define BRIN_elog(level, ...)		elog(level, __VA_ARGS__)
#else
/*
 * The disabled form must accept the same variadic argument list as the
 * enabled one, and must expand to a valid expression statement; "void(0)"
 * is not valid C, and a one-parameter macro rejects multi-argument calls.
 */
#define BRIN_elog(...)				((void) 0)
#endif
||||
|
||||
/*
 * brin.c
 *
 * Presumably these create and release the BrinDesc for an index relation;
 * confirm against the definitions in brin.c.
 */
extern BrinDesc *brin_build_desc(Relation rel);
extern void brin_free_desc(BrinDesc *bdesc);
||||
|
||||
#endif /* BRIN_INTERNAL_H */ |
@ -0,0 +1,70 @@ |
||||
/*
|
||||
* brin_page.h |
||||
* Prototypes and definitions for BRIN page layouts |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/include/access/brin_page.h |
||||
* |
||||
* NOTES |
||||
* |
||||
* These structs should really be private to specific BRIN files, but it's |
||||
* useful to have them here so that they can be used by pageinspect and similar |
||||
* tools. |
||||
*/ |
||||
#ifndef BRIN_PAGE_H |
||||
#define BRIN_PAGE_H |
||||
|
||||
#include "storage/block.h" |
||||
#include "storage/itemptr.h" |
||||
|
||||
/* special space on all BRIN pages stores a "type" identifier */
#define BRIN_PAGETYPE_META			0xF091
#define BRIN_PAGETYPE_REVMAP		0xF092
#define BRIN_PAGETYPE_REGULAR		0xF093

/* read the type identifier out of the page's special space */
#define BRIN_PAGE_TYPE(page)	\
	(((BrinSpecialSpace *) PageGetSpecialPointer(page))->type)
/* tests comparing the stored type against a specific page type */
#define BRIN_IS_REVMAP_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REVMAP)
#define BRIN_IS_REGULAR_PAGE(page) (BRIN_PAGE_TYPE(page) == BRIN_PAGETYPE_REGULAR)
||||
|
||||
/* flags for BrinSpecialSpace */
#define BRIN_EVACUATE_PAGE			(1 << 0)

/* Contents of the special space found on every BRIN page */
typedef struct BrinSpecialSpace
{
	uint16		flags;			/* bitmask of flags, e.g. BRIN_EVACUATE_PAGE */
	uint16		type;			/* one of the BRIN_PAGETYPE_* identifiers */
} BrinSpecialSpace;
||||
|
||||
/* Metapage definitions */
typedef struct BrinMetaPageData
{
	uint32		brinMagic;		/* presumably BRIN_META_MAGIC; confirm in writer */
	uint32		brinVersion;	/* presumably BRIN_CURRENT_VERSION; confirm */
	BlockNumber pagesPerRange;
	BlockNumber lastRevmapPage;
} BrinMetaPageData;

#define BRIN_CURRENT_VERSION		1
#define BRIN_META_MAGIC			0xA8109CFA

/* block number of the metapage within the index */
#define BRIN_METAPAGE_BLKNO		0
||||
|
||||
/* Definitions for revmap pages */
typedef struct RevmapContents
{
	/* declared with one element, but really REVMAP_PAGE_MAXITEMS long */
	ItemPointerData rm_tids[1];
} RevmapContents;

/*
 * Bytes available for the rm_tids array: the block size minus the maxaligned
 * page header, the offset of rm_tids within RevmapContents, and the
 * maxaligned special space.
 */
#define REVMAP_CONTENT_SIZE \
	(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
	 offsetof(RevmapContents, rm_tids) - \
	 MAXALIGN(sizeof(BrinSpecialSpace)))
/* max num of items in the array */
#define REVMAP_PAGE_MAXITEMS \
	(REVMAP_CONTENT_SIZE / sizeof(ItemPointerData))
||||
|
||||
#endif /* BRIN_PAGE_H */ |
@ -0,0 +1,36 @@ |
||||
/*
|
||||
* brin_pageops.h |
||||
* Prototypes for operating on BRIN pages. |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/include/access/brin_pageops.h |
||||
*/ |
||||
#ifndef BRIN_PAGEOPS_H |
||||
#define BRIN_PAGEOPS_H |
||||
|
||||
#include "access/brin_revmap.h" |
||||
|
||||
/* tuple update and insertion */
extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage);
extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz,
							Size newsz);
extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz);

/* page initialization */
extern void brin_page_init(Page page, uint16 type);
extern void brin_metapage_init(Page page, BlockNumber pagesPerRange,
				   uint16 version);

/* page evacuation */
extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf);
||||
|
||||
#endif /* BRIN_PAGEOPS_H */ |
@ -0,0 +1,39 @@ |
||||
/*
|
||||
* brin_revmap.h |
||||
* Prototypes for BRIN reverse range maps |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/include/access/brin_revmap.h |
||||
*/ |
||||
|
||||
#ifndef BRIN_REVMAP_H |
||||
#define BRIN_REVMAP_H |
||||
|
||||
#include "access/brin_tuple.h" |
||||
#include "storage/block.h" |
||||
#include "storage/buf.h" |
||||
#include "storage/itemptr.h" |
||||
#include "storage/off.h" |
||||
#include "utils/relcache.h" |
||||
|
||||
/*
 * Opaque handle for the reverse range map; the struct definition lives in
 * brin_revmap.c, so callers can only hold pointers to it.
 */
typedef struct BrinRevmap BrinRevmap;

/* set up and tear down a revmap handle */
extern BrinRevmap *brinRevmapInitialize(Relation idxrel,
					 BlockNumber *pagesPerRange);
extern void brinRevmapTerminate(BrinRevmap *revmap);

/* revmap access, keyed by heap block number */
extern void brinRevmapExtend(BrinRevmap *revmap,
				 BlockNumber heapBlk);
extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap,
							BlockNumber heapBlk);
extern void brinSetHeapBlockItemptr(Buffer rmbuf, BlockNumber pagesPerRange,
						BlockNumber heapBlk, ItemPointerData tid);
extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap,
						 BlockNumber heapBlk, Buffer *buf, OffsetNumber *off,
						 Size *size, int mode);
||||
|
||||
#endif /* BRIN_REVMAP_H */ |
@ -0,0 +1,96 @@ |
||||
/*
|
||||
* brin_tuple.h |
||||
* Declarations for dealing with BRIN-specific tuples. |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* src/include/access/brin_tuple.h |
||||
*/ |
||||
#ifndef BRIN_TUPLE_H |
||||
#define BRIN_TUPLE_H |
||||
|
||||
#include "access/brin_internal.h" |
||||
#include "access/tupdesc.h" |
||||
|
||||
|
||||
/*
|
||||
* A BRIN index stores one index tuple per page range. Each index tuple |
||||
* has one BrinValues struct for each indexed column; in turn, each BrinValues |
||||
* has (besides the null flags) an array of Datum whose size is determined by |
||||
* the opclass. |
||||
*/ |
||||
/* Summary data for one indexed column of one page range */
typedef struct BrinValues
{
	AttrNumber	bv_attno;		/* index attribute number */
	bool		bv_hasnulls;	/* are there any nulls in the page range? */
	bool		bv_allnulls;	/* are all values nulls in the page range? */
	Datum	   *bv_values;		/* current accumulated values */
} BrinValues;
||||
|
||||
/*
|
||||
* This struct is used to represent an in-memory index tuple. The values can |
||||
* only be meaningfully decoded with an appropriate BrinDesc. |
||||
*/ |
||||
/* In-memory form of an index tuple; bt_columns is variable-length */
typedef struct BrinMemTuple
{
	bool		bt_placeholder; /* this is a placeholder tuple */
	BlockNumber bt_blkno;		/* heap blkno that the tuple is for */
	MemoryContext bt_context;	/* memcxt holding the bt_columns values */
	BrinValues	bt_columns[FLEXIBLE_ARRAY_MEMBER];
} BrinMemTuple;
||||
|
||||
/*
 * An on-disk BRIN tuple.  This is possibly followed by a nulls bitmask, with
 * room for two null bits for each indexed column; an opclass-defined number
 * of Datum values for each column follow.
 */
typedef struct BrinTuple
{
	/* heap block number that the tuple is for */
	BlockNumber bt_blkno;

	/* ---------------
	 * bt_info is laid out in the following fashion:
	 *
	 * 7th (high) bit: has nulls
	 * 6th bit: is placeholder tuple
	 * 5th bit: unused
	 * 4-0 bit: offset of data
	 * ---------------
	 */
	uint8		bt_info;
} BrinTuple;

/* size of the fixed header, excluding any padding after bt_info */
#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8))
||||
|
||||
/*
 * bt_info manipulation macros
 *
 * The low five bits of bt_info hold the data offset; the two highest bits
 * are flag bits.
 */
#define BRIN_OFFSET_MASK		0x1F
/* bit 0x20 is not used at present */
#define BRIN_PLACEHOLDER_MASK	0x40
#define BRIN_NULLS_MASK			0x80

/* data offset, taken from the low bits of bt_info */
#define BrinTupleDataOffset(tup) \
	((Size) (BRIN_OFFSET_MASK & ((const BrinTuple *) (tup))->bt_info))
/* is the "has nulls" bit set? */
#define BrinTupleHasNulls(tup) \
	((BRIN_NULLS_MASK & ((const BrinTuple *) (tup))->bt_info) != 0)
/* is the "placeholder tuple" bit set? */
#define BrinTupleIsPlaceholder(tup) \
	((BRIN_PLACEHOLDER_MASK & ((const BrinTuple *) (tup))->bt_info) != 0)
||||
|
||||
|
||||
/* routines dealing with on-disk tuples (BrinTuple) */
extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,
				BrinMemTuple *tuple, Size *size);
extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc,
							BlockNumber blkno, Size *size);
extern void brin_free_tuple(BrinTuple *tuple);
extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len);
extern bool brin_tuples_equal(const BrinTuple *a, Size alen,
				  const BrinTuple *b, Size blen);

/* routines dealing with in-memory tuples (BrinMemTuple) */
extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc);
extern void brin_memtuple_initialize(BrinMemTuple *dtuple,
						 BrinDesc *brdesc);
extern BrinMemTuple *brin_deform_tuple(BrinDesc *brdesc,
				  BrinTuple *tuple);
||||
|
||||
#endif /* BRIN_TUPLE_H */ |
@ -0,0 +1,109 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* brin_xlog.h |
||||
* POSTGRES BRIN access XLOG definitions. |
||||
* |
||||
* |
||||
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* src/include/access/brin_xlog.h |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#ifndef BRIN_XLOG_H |
||||
#define BRIN_XLOG_H |
||||
|
||||
#include "access/xlogrecord.h" |
||||
#include "lib/stringinfo.h" |
||||
#include "storage/bufpage.h" |
||||
#include "storage/itemptr.h" |
||||
#include "storage/relfilenode.h" |
||||
#include "utils/relcache.h" |
||||
|
||||
|
||||
/*
 * WAL record definitions for BRIN's WAL operations
 *
 * XLOG allows us to store some information in the high 4 bits of the log
 * record's xl_info field.
 */
#define XLOG_BRIN_CREATE_INDEX		0x00
#define XLOG_BRIN_INSERT			0x10
#define XLOG_BRIN_UPDATE			0x20
#define XLOG_BRIN_SAMEPAGE_UPDATE	0x30
#define XLOG_BRIN_REVMAP_EXTEND		0x40
#define XLOG_BRIN_REVMAP_VACUUM		0x50

/* covers the bits used by the operation codes above */
#define XLOG_BRIN_OPMASK			0x70
/*
 * When we insert the first item on a new page, we restore the entire page in
 * redo.  This bit lies outside XLOG_BRIN_OPMASK.
 */
#define XLOG_BRIN_INIT_PAGE		0x80
||||
|
||||
/* This is what we need to know about a BRIN index create */
typedef struct xl_brin_createidx
{
	BlockNumber pagesPerRange;
	RelFileNode node;
	uint16		version;
} xl_brin_createidx;
/* size up to and including the last field, without trailing padding */
#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16))
||||
|
||||
/*
 * This is what we need to know about a BRIN tuple insert
 */
typedef struct xl_brin_insert
{
	RelFileNode node;
	BlockNumber heapBlk;

	/* extra information needed to update the revmap */
	BlockNumber revmapBlk;
	BlockNumber pagesPerRange;

	uint16		tuplen;
	ItemPointerData tid;
	/* tuple data follows at end of struct */
} xl_brin_insert;

/* size up to and including the last field, without trailing padding */
#define SizeOfBrinInsert	(offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData))
||||
|
||||
/*
 * A cross-page update is the same as an insert, but also stores the old tid.
 */
typedef struct xl_brin_update
{
	ItemPointerData oldtid;
	xl_brin_insert new;
} xl_brin_update;

/* offset of the embedded insert record plus that record's own size */
#define SizeOfBrinUpdate	(offsetof(xl_brin_update, new) + SizeOfBrinInsert)
||||
|
||||
/* This is what we need to know about a BRIN tuple samepage update */
typedef struct xl_brin_samepage_update
{
	RelFileNode node;
	ItemPointerData tid;
	/* tuple data follows at end of struct */
} xl_brin_samepage_update;

/* size up to and including the last field, without trailing padding */
#define SizeOfBrinSamepageUpdate		(offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData))
||||
|
||||
/* This is what we need to know about a revmap extension */
typedef struct xl_brin_revmap_extend
{
	RelFileNode node;
	BlockNumber targetBlk;
} xl_brin_revmap_extend;

/* size up to and including the last field, without trailing padding */
#define SizeOfBrinRevmapExtend	(offsetof(xl_brin_revmap_extend, targetBlk) + \
								 sizeof(BlockNumber))
||||
|
||||
|
||||
/*
 * Entry points for BRIN WAL records; judging by their names these describe,
 * replay and name a record respectively (confirm against the definitions).
 */
extern void brin_desc(StringInfo buf, XLogRecord *record);
extern void brin_redo(XLogRecPtr lsn, XLogRecord *record);
extern const char *brin_identify(uint8 info);
||||
|
||||
#endif /* BRIN_XLOG_H */ |
@ -0,0 +1,179 @@ |
||||
SET synchronous_commit = 0; |
||||
CREATE TABLE brintest (byteacol bytea, |
||||
charcol "char", |
||||
namecol name, |
||||
int8col bigint, |
||||
int2col smallint, |
||||
int4col integer, |
||||
textcol text, |
||||
oidcol oid, |
||||
tidcol tid, |
||||
float4col real, |
||||
float8col double precision, |
||||
macaddrcol macaddr, |
||||
inetcol inet, |
||||
bpcharcol character, |
||||
datecol date, |
||||
timecol time without time zone, |
||||
timestampcol timestamp without time zone, |
||||
timestamptzcol timestamp with time zone, |
||||
intervalcol interval, |
||||
timetzcol time with time zone, |
||||
bitcol bit(10), |
||||
varbitcol bit varying(16), |
||||
numericcol numeric, |
||||
uuidcol uuid, |
||||
lsncol pg_lsn |
||||
) WITH (fillfactor=50); |
||||
INSERT INTO brintest SELECT |
||||
repeat(stringu1, 42)::bytea, |
||||
substr(stringu1, 1, 1)::"char", |
||||
stringu1::name, 142857 * tenthous, |
||||
thousand, |
||||
twothousand, |
||||
repeat(stringu1, 42), |
||||
unique1::oid, |
||||
format('(%s,%s)', tenthous, twenty)::tid, |
||||
(four + 1.0)/(hundred+1), |
||||
odd::float8 / (tenthous + 1), |
||||
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, |
||||
inet '10.2.3.4' + tenthous, |
||||
substr(stringu1, 1, 1)::bpchar, |
||||
date '1995-08-15' + tenthous, |
||||
time '01:20:30' + thousand * interval '18.5 second', |
||||
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', |
||||
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour', |
||||
justify_days(justify_hours(tenthous * interval '12 minutes')), |
||||
timetz '01:30:20' + hundred * interval '15 seconds', |
||||
thousand::bit(10), |
||||
tenthous::bit(16)::varbit, |
||||
tenthous::numeric(36,30) * fivethous * even / (hundred + 1), |
||||
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, |
||||
format('%s/%s%s', odd, even, tenthous)::pg_lsn |
||||
FROM tenk1; |
||||
CREATE INDEX brinidx ON brintest USING brin ( |
||||
byteacol, |
||||
charcol, |
||||
namecol, |
||||
int8col, |
||||
int2col, |
||||
int4col, |
||||
textcol, |
||||
oidcol, |
||||
tidcol, |
||||
float4col, |
||||
float8col, |
||||
macaddrcol, |
||||
inetcol, |
||||
bpcharcol, |
||||
datecol, |
||||
timecol, |
||||
timestampcol, |
||||
timestamptzcol, |
||||
intervalcol, |
||||
timetzcol, |
||||
bitcol, |
||||
varbitcol, |
||||
numericcol, |
||||
uuidcol, |
||||
lsncol |
||||
) with (pages_per_range = 1); |
||||
CREATE TABLE brinopers (colname name, op text[], value text[], |
||||
check (cardinality(op) = cardinality(value))); |
||||
INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); |
||||
INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); |
||||
INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); |
||||
INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}'); |
||||
INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}'); |
||||
INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}'); |
||||
INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}'); |
||||
INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); |
||||
INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}'); |
||||
INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}'); |
||||
INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}'); |
||||
INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}'); |
||||
INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}'); |
||||
INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}'); |
||||
INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}'); |
||||
INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}'); |
||||
INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}'); |
||||
INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}'); |
||||
INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}'); |
||||
INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}'); |
||||
DO $x$ |
||||
DECLARE |
||||
r record; |
||||
tabname text; |
||||
tabname_ss text; |
||||
count int; |
||||
query text; |
||||
plan text; |
||||
BEGIN |
||||
FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP |
||||
tabname := format('qry_%s', r.row_number); |
||||
tabname_ss := tabname || '_ss'; |
||||
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, |
||||
tabname, r.colname, r.oper, r.value); |
||||
-- run the query using the brin index |
||||
SET enable_seqscan = 0; |
||||
SET enable_bitmapscan = 1; |
||||
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname); |
||||
EXECUTE query; |
||||
|
||||
-- run the query using a seqscan |
||||
SET enable_seqscan = 1; |
||||
SET enable_bitmapscan = 0; |
||||
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, |
||||
tabname_ss, r.colname, r.oper, r.value); |
||||
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss); |
||||
EXECUTE query; |
||||
|
||||
-- make sure both return the same results |
||||
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss); |
||||
GET DIAGNOSTICS count = ROW_COUNT; |
||||
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; |
||||
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname); |
||||
GET DIAGNOSTICS count = ROW_COUNT; |
||||
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; |
||||
end loop; |
||||
end; |
||||
$x$; |
||||
INSERT INTO brintest SELECT |
||||
repeat(stringu1, 42)::bytea, |
||||
substr(stringu1, 1, 1)::"char", |
||||
stringu1::name, 142857 * tenthous, |
||||
thousand, |
||||
twothousand, |
||||
repeat(stringu1, 42), |
||||
unique1::oid, |
||||
format('(%s,%s)', tenthous, twenty)::tid, |
||||
(four + 1.0)/(hundred+1), |
||||
odd::float8 / (tenthous + 1), |
||||
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, |
||||
inet '10.2.3.4' + tenthous, |
||||
substr(stringu1, 1, 1)::bpchar, |
||||
date '1995-08-15' + tenthous, |
||||
time '01:20:30' + thousand * interval '18.5 second', |
||||
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', |
||||
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour', |
||||
justify_days(justify_hours(tenthous * interval '12 minutes')), |
||||
timetz '01:30:20' + hundred * interval '15 seconds', |
||||
thousand::bit(10), |
||||
tenthous::bit(16)::varbit, |
||||
tenthous::numeric(36,30) * fivethous * even / (hundred + 1), |
||||
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, |
||||
format('%s/%s%s', odd, even, tenthous)::pg_lsn |
||||
FROM tenk1; |
||||
SELECT brin_summarize_new_values('brinidx'::regclass); |
||||
brin_summarize_new_values |
||||
--------------------------- |
||||
2000 |
||||
(1 row) |
||||
|
||||
UPDATE brintest SET int8col = int8col * int4col; |
||||
SET synchronous_commit = 1; |
@ -0,0 +1,184 @@ |
||||
SET synchronous_commit = 0; |
||||
|
||||
CREATE TABLE brintest (byteacol bytea, |
||||
charcol "char", |
||||
namecol name, |
||||
int8col bigint, |
||||
int2col smallint, |
||||
int4col integer, |
||||
textcol text, |
||||
oidcol oid, |
||||
tidcol tid, |
||||
float4col real, |
||||
float8col double precision, |
||||
macaddrcol macaddr, |
||||
inetcol inet, |
||||
bpcharcol character, |
||||
datecol date, |
||||
timecol time without time zone, |
||||
timestampcol timestamp without time zone, |
||||
timestamptzcol timestamp with time zone, |
||||
intervalcol interval, |
||||
timetzcol time with time zone, |
||||
bitcol bit(10), |
||||
varbitcol bit varying(16), |
||||
numericcol numeric, |
||||
uuidcol uuid, |
||||
lsncol pg_lsn |
||||
) WITH (fillfactor=50); |
||||
|
||||
INSERT INTO brintest SELECT |
||||
repeat(stringu1, 42)::bytea, |
||||
substr(stringu1, 1, 1)::"char", |
||||
stringu1::name, 142857 * tenthous, |
||||
thousand, |
||||
twothousand, |
||||
repeat(stringu1, 42), |
||||
unique1::oid, |
||||
format('(%s,%s)', tenthous, twenty)::tid, |
||||
(four + 1.0)/(hundred+1), |
||||
odd::float8 / (tenthous + 1), |
||||
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, |
||||
inet '10.2.3.4' + tenthous, |
||||
substr(stringu1, 1, 1)::bpchar, |
||||
date '1995-08-15' + tenthous, |
||||
time '01:20:30' + thousand * interval '18.5 second', |
||||
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', |
||||
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour', |
||||
justify_days(justify_hours(tenthous * interval '12 minutes')), |
||||
timetz '01:30:20' + hundred * interval '15 seconds', |
||||
thousand::bit(10), |
||||
tenthous::bit(16)::varbit, |
||||
tenthous::numeric(36,30) * fivethous * even / (hundred + 1), |
||||
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, |
||||
format('%s/%s%s', odd, even, tenthous)::pg_lsn |
||||
FROM tenk1; |
||||
|
||||
CREATE INDEX brinidx ON brintest USING brin ( |
||||
byteacol, |
||||
charcol, |
||||
namecol, |
||||
int8col, |
||||
int2col, |
||||
int4col, |
||||
textcol, |
||||
oidcol, |
||||
tidcol, |
||||
float4col, |
||||
float8col, |
||||
macaddrcol, |
||||
inetcol, |
||||
bpcharcol, |
||||
datecol, |
||||
timecol, |
||||
timestampcol, |
||||
timestamptzcol, |
||||
intervalcol, |
||||
timetzcol, |
||||
bitcol, |
||||
varbitcol, |
||||
numericcol, |
||||
uuidcol, |
||||
lsncol |
||||
) with (pages_per_range = 1); |
||||
|
||||
CREATE TABLE brinopers (colname name, op text[], value text[], |
||||
check (cardinality(op) = cardinality(value))); |
||||
|
||||
INSERT INTO brinopers VALUES ('byteacol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); |
||||
INSERT INTO brinopers VALUES ('charcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); |
||||
INSERT INTO brinopers VALUES ('namecol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAAA, AAAAAA, AAAAAA}'); |
||||
INSERT INTO brinopers VALUES ('int8col', '{>, >=, =, <=, <}', '{1428427143, 1428427143, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('int2col', '{>, >=, =, <=, <}', '{999, 999, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('int4col', '{>, >=, =, <=, <}', '{1999, 1999, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('textcol', '{>, >=, =, <=, <}', '{ZZAAAA, ZZAAAA, AAAAA, AAAAA, AAAAA}'); |
||||
INSERT INTO brinopers VALUES ('oidcol', '{>, >=, =, <=, <}', '{9999, 9999, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('tidcol', '{>, >=, =, <=, <}', '{"(9999,19)", "(9999,19)", "(0,0)", "(0,0)", "(0,0)"}'); |
||||
INSERT INTO brinopers VALUES ('float4col', '{>, >=, =, <=, <}', '{1, 1, 0.0103093, 0.0103093, 0.0103093}'); |
||||
INSERT INTO brinopers VALUES ('float8col', '{>, >=, =, <=, <}', '{1.98, 1.98, 0, 0, 0}'); |
||||
INSERT INTO brinopers VALUES ('inetcol', '{>, >=, =, <=, <}', '{10.2.42.19, 10.2.42.19, 10.2.3.4, 10.2.3.4, 10.2.3.4}'); |
||||
INSERT INTO brinopers VALUES ('bpcharcol', '{>, >=, =, <=, <}', '{Z, Z, A, A, A}'); |
||||
INSERT INTO brinopers VALUES ('datecol', '{>, >=, =, <=, <}', '{2022-12-30, 2022-12-30, 1995-08-15, 1995-08-15, 1995-08-15}'); |
||||
INSERT INTO brinopers VALUES ('timecol', '{>, >=, =, <=, <}', '{06:28:31.5, 06:28:31.5, 01:20:30, 01:20:30, 01:20:30}'); |
||||
INSERT INTO brinopers VALUES ('timestampcol', '{>, >=, =, <=, <}', '{1984-01-20 22:42:21, 1984-01-20 22:42:21, 1942-07-23 03:05:09, 1942-07-23 03:05:09, 1942-07-23 03:05:09}'); |
||||
INSERT INTO brinopers VALUES ('timestamptzcol', '{>, >=, =, <=, <}', '{1972-11-20 19:00:00-03, 1972-11-20 19:00:00-03, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04, 1972-10-10 03:00:00-04}'); |
||||
INSERT INTO brinopers VALUES ('intervalcol', '{>, >=, =, <=, <}', '{2 mons 23 days 07:48:00, 2 mons 23 days 07:48:00, 00:00:00, 00:00:00, 00:00:00}'); |
||||
INSERT INTO brinopers VALUES ('timetzcol', '{>, >=, =, <=, <}', '{01:55:05-03, 01:55:05-03, 01:30:20-03, 01:30:20-03, 01:30:20-03}'); |
||||
INSERT INTO brinopers VALUES ('numericcol', '{>, >=, =, <=, <}', '{99470151.9, 99470151.9, 0.00, 0.01, 0.01}'); |
||||
INSERT INTO brinopers VALUES ('macaddrcol', '{>, >=, =, <=, <}', '{ff:fe:00:00:00:00, ff:fe:00:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00, 00:00:01:00:00:00}'); |
||||
INSERT INTO brinopers VALUES ('bitcol', '{>, >=, =, <=, <}', '{1111111000, 1111111000, 0000000010, 0000000010, 0000000010}'); |
||||
INSERT INTO brinopers VALUES ('varbitcol', '{>, >=, =, <=, <}', '{1111111111111000, 1111111111111000, 0000000000000100, 0000000000000100, 0000000000000100}'); |
||||
INSERT INTO brinopers VALUES ('uuidcol', '{>, >=, =, <=, <}', '{99989998-9998-9998-9998-999899989998, 99989998-9998-9998-9998-999899989998, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040004, 00040004-0004-0004-0004-000400040005}'); |
||||
INSERT INTO brinopers VALUES ('lsncol', '{>, >=, =, <=, <}', '{198/1999799, 198/1999799, 30/312815, 0/1200, 0/1200}'); |
||||
|
||||
DO $x$ |
||||
DECLARE |
||||
r record; |
||||
tabname text; |
||||
tabname_ss text; |
||||
count int; |
||||
query text; |
||||
plan text; |
||||
BEGIN |
||||
FOR r IN SELECT row_number() OVER (), colname, oper, value[ordinality] FROM brinopers, unnest(op) WITH ORDINALITY AS oper LOOP |
||||
tabname := format('qry_%s', r.row_number); |
||||
tabname_ss := tabname || '_ss'; |
||||
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, |
||||
tabname, r.colname, r.oper, r.value); |
||||
-- run the query using the brin index |
||||
SET enable_seqscan = 0; |
||||
SET enable_bitmapscan = 1; |
||||
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP*/', tabname); |
||||
EXECUTE query; |
||||
|
||||
-- run the query using a seqscan |
||||
SET enable_seqscan = 1; |
||||
SET enable_bitmapscan = 0; |
||||
query = format($y$INSERT INTO %s SELECT ctid FROM brintest WHERE %s %s %L $y$, |
||||
tabname_ss, r.colname, r.oper, r.value); |
||||
EXECUTE format('create temp table %s (tid tid) /* ON COMMIT DROP */', tabname_ss); |
||||
EXECUTE query; |
||||
|
||||
-- make sure both return the same results |
||||
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname, tabname_ss); |
||||
GET DIAGNOSTICS count = ROW_COUNT; |
||||
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; |
||||
EXECUTE format('SELECT * from %s EXCEPT ALL SELECT * FROM %s', tabname_ss, tabname); |
||||
GET DIAGNOSTICS count = ROW_COUNT; |
||||
IF count <> 0 THEN RAISE EXCEPTION 'something not right in %: count %', r, count; END IF; |
||||
end loop; |
||||
end; |
||||
$x$; |
||||
|
||||
INSERT INTO brintest SELECT |
||||
repeat(stringu1, 42)::bytea, |
||||
substr(stringu1, 1, 1)::"char", |
||||
stringu1::name, 142857 * tenthous, |
||||
thousand, |
||||
twothousand, |
||||
repeat(stringu1, 42), |
||||
unique1::oid, |
||||
format('(%s,%s)', tenthous, twenty)::tid, |
||||
(four + 1.0)/(hundred+1), |
||||
odd::float8 / (tenthous + 1), |
||||
format('%s:00:%s:00:%s:00', to_hex(odd), to_hex(even), to_hex(hundred))::macaddr, |
||||
inet '10.2.3.4' + tenthous, |
||||
substr(stringu1, 1, 1)::bpchar, |
||||
date '1995-08-15' + tenthous, |
||||
time '01:20:30' + thousand * interval '18.5 second', |
||||
timestamp '1942-07-23 03:05:09' + tenthous * interval '36.38 hours', |
||||
timestamptz '1972-10-10 03:00' + thousand * interval '1 hour', |
||||
justify_days(justify_hours(tenthous * interval '12 minutes')), |
||||
timetz '01:30:20' + hundred * interval '15 seconds', |
||||
thousand::bit(10), |
||||
tenthous::bit(16)::varbit, |
||||
tenthous::numeric(36,30) * fivethous * even / (hundred + 1), |
||||
format('%s%s-%s-%s-%s-%s%s%s', to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'), to_char(tenthous, 'FM0000'))::uuid, |
||||
format('%s/%s%s', odd, even, tenthous)::pg_lsn |
||||
FROM tenk1; |
||||
|
||||
SELECT brin_summarize_new_values('brinidx'::regclass); |
||||
|
||||
UPDATE brintest SET int8col = int8col * int4col; |
||||
|
||||
SET synchronous_commit = 1; |
Loading…
Reference in new issue