mirror of https://github.com/postgres/postgres
This module provides a new access method. It is actually a simple Bloom filter implemented as a PostgreSQL index. It can give some benefit for searches over a large number of columns. The module is also the only way to test the generic WAL interface committed earlier. Author: Teodor Sigaev, Alexander Korotkov. Reviewers: Aleksander Alekseev, Michael Paquier, Jim Nasby
pull/11/head
parent
4e56e5a6de
commit
9ee014fc89
@ -0,0 +1,4 @@ |
||||
# Generated subdirectories |
||||
/log/ |
||||
/results/ |
||||
/tmp_check/ |
@ -0,0 +1,24 @@ |
||||
# contrib/bloom/Makefile
|
||||
|
||||
MODULE_big = bloom
|
||||
OBJS = blcost.o blinsert.o blscan.o blutils.o blvacuum.o blvalidate.o $(WIN32RES)
|
||||
|
||||
EXTENSION = bloom
|
||||
DATA = bloom--1.0.sql
|
||||
PGFILEDESC = "bloom access method - signature file based index"
|
||||
|
||||
REGRESS = bloom
|
||||
|
||||
ifdef USE_PGXS |
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS) |
||||
else |
||||
subdir = contrib/bloom
|
||||
top_builddir = ../..
|
||||
include $(top_builddir)/src/Makefile.global |
||||
include $(top_srcdir)/contrib/contrib-global.mk |
||||
endif |
||||
|
||||
wal-check: temp-install |
||||
$(prove_check)
|
@ -0,0 +1,48 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* blcost.c |
||||
* Cost estimate function for bloom indexes. |
||||
* |
||||
* Copyright (c) 2016, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/bloom/blcost.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "fmgr.h" |
||||
#include "optimizer/cost.h" |
||||
#include "utils/selfuncs.h" |
||||
|
||||
#include "bloom.h" |
||||
|
||||
/*
|
||||
* Estimate cost of bloom index scan. |
||||
*/ |
||||
void |
||||
blcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, |
||||
Cost *indexStartupCost, Cost *indexTotalCost, |
||||
Selectivity *indexSelectivity, double *indexCorrelation) |
||||
{ |
||||
IndexOptInfo *index = path->indexinfo; |
||||
List *qinfos; |
||||
GenericCosts costs; |
||||
|
||||
/* Do preliminary analysis of indexquals */ |
||||
qinfos = deconstruct_indexquals(path); |
||||
|
||||
MemSet(&costs, 0, sizeof(costs)); |
||||
|
||||
/* We have to visit all index tuples anyway */ |
||||
costs.numIndexTuples = index->tuples; |
||||
|
||||
/* Use generic estimate */ |
||||
genericcostestimate(root, path, loop_count, qinfos, &costs); |
||||
|
||||
*indexStartupCost = costs.indexStartupCost; |
||||
*indexTotalCost = costs.indexTotalCost; |
||||
*indexSelectivity = costs.indexSelectivity; |
||||
*indexCorrelation = costs.indexCorrelation; |
||||
} |
@ -0,0 +1,313 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* blinsert.c |
||||
* Bloom index build and insert functions. |
||||
* |
||||
* Copyright (c) 2016, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/bloom/blinsert.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/genam.h" |
||||
#include "access/generic_xlog.h" |
||||
#include "catalog/index.h" |
||||
#include "miscadmin.h" |
||||
#include "storage/bufmgr.h" |
||||
#include "storage/indexfsm.h" |
||||
#include "utils/memutils.h" |
||||
#include "utils/rel.h" |
||||
|
||||
#include "bloom.h" |
||||
|
||||
PG_MODULE_MAGIC; |
||||
|
||||
/*
|
||||
* State of bloom index build. We accumulate one page data here before |
||||
* flushing it to buffer manager. |
||||
*/ |
||||
typedef struct |
||||
{ |
||||
BloomState blstate; /* bloom index state */ |
||||
MemoryContext tmpCtx; /* temporary memory context reset after
|
||||
* each tuple */ |
||||
char data[BLCKSZ]; /* cached page */ |
||||
int64 count; /* number of tuples in cached page */ |
||||
} BloomBuildState; |
||||
|
||||
/*
|
||||
* Flush page cached in BloomBuildState. |
||||
*/ |
||||
static void |
||||
flushCachedPage(Relation index, BloomBuildState *buildstate) |
||||
{ |
||||
Page page; |
||||
Buffer buffer = BloomNewBuffer(index); |
||||
GenericXLogState *state; |
||||
|
||||
state = GenericXLogStart(index); |
||||
page = GenericXLogRegister(state, buffer, true); |
||||
memcpy(page, buildstate->data, BLCKSZ); |
||||
GenericXLogFinish(state); |
||||
UnlockReleaseBuffer(buffer); |
||||
} |
||||
|
||||
/*
|
||||
* (Re)initialize cached page in BloomBuildState. |
||||
*/ |
||||
static void |
||||
initCachedPage(BloomBuildState *buildstate) |
||||
{ |
||||
memset(buildstate->data, 0, BLCKSZ); |
||||
BloomInitPage(buildstate->data, 0); |
||||
buildstate->count = 0; |
||||
} |
||||
|
||||
/*
|
||||
* Per-tuple callback from IndexBuildHeapScan. |
||||
*/ |
||||
static void |
||||
bloomBuildCallback(Relation index, HeapTuple htup, Datum *values, |
||||
bool *isnull, bool tupleIsAlive, void *state) |
||||
{ |
||||
BloomBuildState *buildstate = (BloomBuildState *) state; |
||||
MemoryContext oldCtx; |
||||
BloomTuple *itup; |
||||
|
||||
oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); |
||||
|
||||
itup = BloomFormTuple(&buildstate->blstate, &htup->t_self, values, isnull); |
||||
|
||||
/* Try to add next item to cached page */ |
||||
if (BloomPageAddItem(&buildstate->blstate, buildstate->data, itup)) |
||||
{ |
||||
/* Next item was added successfully */ |
||||
buildstate->count++; |
||||
} |
||||
else |
||||
{ |
||||
/* Cached page is full, flush it out and make a new one */ |
||||
flushCachedPage(index, buildstate); |
||||
|
||||
CHECK_FOR_INTERRUPTS(); |
||||
|
||||
initCachedPage(buildstate); |
||||
|
||||
if (BloomPageAddItem(&buildstate->blstate, buildstate->data, itup) == false) |
||||
{ |
||||
/* We shouldn't be here since we're inserting to the empty page */ |
||||
elog(ERROR, "can not add new tuple"); |
||||
} |
||||
} |
||||
|
||||
MemoryContextSwitchTo(oldCtx); |
||||
MemoryContextReset(buildstate->tmpCtx); |
||||
} |
||||
|
||||
/*
|
||||
* Build a new bloom index. |
||||
*/ |
||||
IndexBuildResult * |
||||
blbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
||||
{ |
||||
IndexBuildResult *result; |
||||
double reltuples; |
||||
BloomBuildState buildstate; |
||||
|
||||
if (RelationGetNumberOfBlocks(index) != 0) |
||||
elog(ERROR, "index \"%s\" already contains data", |
||||
RelationGetRelationName(index)); |
||||
|
||||
/* Initialize the meta page */ |
||||
BloomInitMetapage(index); |
||||
|
||||
/* Initialize the bloom build state */ |
||||
memset(&buildstate, 0, sizeof(buildstate)); |
||||
initBloomState(&buildstate.blstate, index); |
||||
buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, |
||||
"Bloom build temporary context", |
||||
ALLOCSET_DEFAULT_MINSIZE, |
||||
ALLOCSET_DEFAULT_INITSIZE, |
||||
ALLOCSET_DEFAULT_MAXSIZE); |
||||
initCachedPage(&buildstate); |
||||
|
||||
/* Do the heap scan */ |
||||
reltuples = IndexBuildHeapScan(heap, index, indexInfo, true, |
||||
bloomBuildCallback, (void *) &buildstate); |
||||
|
||||
/*
|
||||
* There are could be some items in cached page. Flush this page |
||||
* if needed. |
||||
*/ |
||||
if (buildstate.count > 0) |
||||
flushCachedPage(index, &buildstate); |
||||
|
||||
MemoryContextDelete(buildstate.tmpCtx); |
||||
|
||||
result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
||||
result->heap_tuples = result->index_tuples = reltuples; |
||||
|
||||
return result; |
||||
} |
||||
|
||||
/*
|
||||
* Build an empty bloom index in the initialization fork. |
||||
*/ |
||||
void |
||||
blbuildempty(Relation index) |
||||
{ |
||||
if (RelationGetNumberOfBlocks(index) != 0) |
||||
elog(ERROR, "index \"%s\" already contains data", |
||||
RelationGetRelationName(index)); |
||||
|
||||
/* Initialize the meta page */ |
||||
BloomInitMetapage(index); |
||||
} |
||||
|
||||
/*
|
||||
* Insert new tuple to the bloom index. |
||||
*/ |
||||
bool |
||||
blinsert(Relation index, Datum *values, bool *isnull, |
||||
ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique) |
||||
{ |
||||
BloomState blstate; |
||||
BloomTuple *itup; |
||||
MemoryContext oldCtx; |
||||
MemoryContext insertCtx; |
||||
BloomMetaPageData *metaData; |
||||
Buffer buffer, |
||||
metaBuffer; |
||||
Page page, |
||||
metaPage; |
||||
BlockNumber blkno = InvalidBlockNumber; |
||||
OffsetNumber nStart; |
||||
GenericXLogState *state; |
||||
|
||||
insertCtx = AllocSetContextCreate(CurrentMemoryContext, |
||||
"Bloom insert temporary context", |
||||
ALLOCSET_DEFAULT_MINSIZE, |
||||
ALLOCSET_DEFAULT_INITSIZE, |
||||
ALLOCSET_DEFAULT_MAXSIZE); |
||||
|
||||
oldCtx = MemoryContextSwitchTo(insertCtx); |
||||
|
||||
initBloomState(&blstate, index); |
||||
itup = BloomFormTuple(&blstate, ht_ctid, values, isnull); |
||||
|
||||
/*
|
||||
* At first, try to insert new tuple to the first page in notFullPage |
||||
* array. If success we don't need to modify the meta page. |
||||
*/ |
||||
metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); |
||||
LockBuffer(metaBuffer, BUFFER_LOCK_SHARE); |
||||
metaData = BloomPageGetMeta(BufferGetPage(metaBuffer)); |
||||
|
||||
if (metaData->nEnd > metaData->nStart) |
||||
{ |
||||
Page page; |
||||
|
||||
blkno = metaData->notFullPage[metaData->nStart]; |
||||
|
||||
Assert(blkno != InvalidBlockNumber); |
||||
LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); |
||||
|
||||
buffer = ReadBuffer(index, blkno); |
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
||||
state = GenericXLogStart(index); |
||||
page = GenericXLogRegister(state, buffer, false); |
||||
|
||||
if (BloomPageAddItem(&blstate, page, itup)) |
||||
{ |
||||
GenericXLogFinish(state); |
||||
UnlockReleaseBuffer(buffer); |
||||
ReleaseBuffer(metaBuffer); |
||||
MemoryContextSwitchTo(oldCtx); |
||||
MemoryContextDelete(insertCtx); |
||||
return false; |
||||
} |
||||
else |
||||
{ |
||||
GenericXLogAbort(state); |
||||
UnlockReleaseBuffer(buffer); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
/* First page in notFullPage isn't suitable */ |
||||
LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); |
||||
} |
||||
|
||||
/*
|
||||
* Try other pages in notFullPage array. We will have to change nStart in |
||||
* metapage. Thus, grab exclusive lock on metapage. |
||||
*/ |
||||
LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE); |
||||
|
||||
state = GenericXLogStart(index); |
||||
metaPage = GenericXLogRegister(state, metaBuffer, false); |
||||
metaData = BloomPageGetMeta(metaPage); |
||||
|
||||
/*
|
||||
* Iterate over notFullPage array. Skip page we already tried first. |
||||
*/ |
||||
nStart = metaData->nStart; |
||||
if (metaData->nEnd > nStart && |
||||
blkno == metaData->notFullPage[nStart]) |
||||
nStart++; |
||||
|
||||
while (metaData->nEnd > nStart) |
||||
{ |
||||
blkno = metaData->notFullPage[nStart]; |
||||
Assert(blkno != InvalidBlockNumber); |
||||
|
||||
buffer = ReadBuffer(index, blkno); |
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
||||
page = GenericXLogRegister(state, buffer, false); |
||||
|
||||
if (BloomPageAddItem(&blstate, page, itup)) |
||||
{ |
||||
metaData->nStart = nStart; |
||||
GenericXLogFinish(state); |
||||
UnlockReleaseBuffer(buffer); |
||||
UnlockReleaseBuffer(metaBuffer); |
||||
MemoryContextSwitchTo(oldCtx); |
||||
MemoryContextDelete(insertCtx); |
||||
return false; |
||||
} |
||||
else |
||||
{ |
||||
GenericXLogUnregister(state, buffer); |
||||
UnlockReleaseBuffer(buffer); |
||||
} |
||||
nStart++; |
||||
} |
||||
|
||||
GenericXLogAbort(state); |
||||
|
||||
/*
|
||||
* Didn't find place to insert in notFullPage array. Allocate new page. |
||||
*/ |
||||
buffer = BloomNewBuffer(index); |
||||
|
||||
state = GenericXLogStart(index); |
||||
metaPage = GenericXLogRegister(state, metaBuffer, false); |
||||
metaData = BloomPageGetMeta(metaPage); |
||||
page = GenericXLogRegister(state, buffer, true); |
||||
BloomInitPage(page, 0); |
||||
BloomPageAddItem(&blstate, page, itup); |
||||
|
||||
metaData->nStart = 0; |
||||
metaData->nEnd = 1; |
||||
metaData->notFullPage[0] = BufferGetBlockNumber(buffer); |
||||
|
||||
GenericXLogFinish(state); |
||||
|
||||
UnlockReleaseBuffer(buffer); |
||||
UnlockReleaseBuffer(metaBuffer); |
||||
|
||||
return false; |
||||
} |
@ -0,0 +1,19 @@ |
||||
/* contrib/bloom/bloom--1.0.sql */

-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION bloom" to load this file. \quit

-- Handler function.  Plain CREATE (not CREATE OR REPLACE) so that the
-- script fails instead of silently taking over an existing function.
CREATE FUNCTION blhandler(internal)
RETURNS index_am_handler
AS 'MODULE_PATHNAME'
LANGUAGE C;

-- Access method
CREATE ACCESS METHOD bloom TYPE INDEX HANDLER blhandler;

-- Opclasses

CREATE OPERATOR CLASS int4_ops
DEFAULT FOR TYPE int4 USING bloom AS
	OPERATOR	1	=(int4, int4),
	FUNCTION	1	hashint4(int4);

CREATE OPERATOR CLASS text_ops
DEFAULT FOR TYPE text USING bloom AS
	OPERATOR	1	=(text, text),
	FUNCTION	1	hashtext(text);
@ -0,0 +1,5 @@ |
||||
# bloom extension |
||||
comment = 'bloom access method - signature file based index' |
||||
default_version = '1.0' |
||||
module_pathname = '$libdir/bloom' |
||||
relocatable = true |
@ -0,0 +1,178 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* bloom.h |
||||
* Header for bloom index. |
||||
* |
||||
* Copyright (c) 2016, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/bloom/bloom.h |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#ifndef _BLOOM_H_ |
||||
#define _BLOOM_H_ |
||||
|
||||
#include "access/amapi.h" |
||||
#include "access/generic_xlog.h" |
||||
#include "access/itup.h" |
||||
#include "access/xlog.h" |
||||
#include "nodes/relation.h" |
||||
#include "fmgr.h" |
||||
|
||||
/* Support procedures numbers */ |
||||
#define BLOOM_HASH_PROC 1 |
||||
#define BLOOM_NPROC 1 |
||||
|
||||
/* Scan strategies */ |
||||
#define BLOOM_EQUAL_STRATEGY 1 |
||||
#define BLOOM_NSTRATEGIES 1 |
||||
|
||||
/* Opaque for bloom pages */ |
||||
typedef struct BloomPageOpaqueData |
||||
{ |
||||
OffsetNumber maxoff; |
||||
uint16 flags; |
||||
} BloomPageOpaqueData; |
||||
|
||||
typedef BloomPageOpaqueData *BloomPageOpaque; |
||||
|
||||
/* Flag bits stored in BloomPageOpaqueData.flags */
#define BLOOM_META		(1 << 0)
#define BLOOM_DELETED	(1 << 1)

/* Accessors for the opaque area and its fields */
#define BloomPageGetOpaque(page) \
	((BloomPageOpaque) PageGetSpecialPointer(page))
#define BloomPageGetMaxOffset(page) \
	(BloomPageGetOpaque(page)->maxoff)

/* Flag tests and updates */
#define BloomPageIsMeta(page) \
	(BloomPageGetOpaque(page)->flags & BLOOM_META)
#define BloomPageIsDeleted(page) \
	(BloomPageGetOpaque(page)->flags & BLOOM_DELETED)
#define BloomPageSetDeleted(page) \
	(BloomPageGetOpaque(page)->flags |= BLOOM_DELETED)
#define BloomPageSetNonDeleted(page) \
	(BloomPageGetOpaque(page)->flags &= ~BLOOM_DELETED)

/*
 * Tuple addressing.  Tuples are packed back to back in the page contents,
 * each (state)->sizeOfBloomTuple bytes long; offsets are 1-based.
 */
#define BloomPageGetData(page) \
	((BloomTuple *) PageGetContents(page))
#define BloomPageGetTuple(state, page, offset) \
	((BloomTuple *) (PageGetContents(page) \
		+ (state)->sizeOfBloomTuple * ((offset) - 1)))
#define BloomPageGetNextTuple(state, tuple) \
	((BloomTuple *) ((Pointer) (tuple) + (state)->sizeOfBloomTuple))

/* Reserved block numbers */
#define BLOOM_METAPAGE_BLKNO	(0)
#define BLOOM_HEAD_BLKNO		(1)		/* first data page */
||||
|
||||
/* Bloom index options */ |
||||
typedef struct BloomOptions |
||||
{ |
||||
int32 vl_len_; /* varlena header (do not touch directly!) */ |
||||
int bloomLength; /* length of signature in uint16 */ |
||||
int bitSize[INDEX_MAX_KEYS]; /* signature bits per index
|
||||
* key */ |
||||
} BloomOptions; |
||||
|
||||
/*
|
||||
* FreeBlockNumberArray - array of block numbers sized so that metadata fill |
||||
* all space in metapage. |
||||
*/ |
||||
typedef BlockNumber FreeBlockNumberArray[ |
||||
MAXALIGN_DOWN( |
||||
BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData)) |
||||
- MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions)) |
||||
) / sizeof(BlockNumber) |
||||
]; |
||||
|
||||
/* Metadata of bloom index */ |
||||
typedef struct BloomMetaPageData |
||||
{ |
||||
uint32 magickNumber; |
||||
uint16 nStart; |
||||
uint16 nEnd; |
||||
BloomOptions opts; |
||||
FreeBlockNumberArray notFullPage; |
||||
} BloomMetaPageData; |
||||
|
||||
/* Magic number identifying a bloom metapage */
#define BLOOM_MAGICK_NUMBER (0xDBAC0DED)

/* Number of block numbers that fit into BloomMetaPageData.notFullPage */
#define BloomMetaBlockN \
	(sizeof(FreeBlockNumberArray) / sizeof(BlockNumber))

/* Metadata structure stored in the contents area of the given page */
#define BloomPageGetMeta(page) \
	((BloomMetaPageData *) PageGetContents(page))
||||
|
||||
typedef struct BloomState |
||||
{ |
||||
FmgrInfo hashFn[INDEX_MAX_KEYS]; |
||||
BloomOptions *opts; /* stored in rd_amcache and defined at
|
||||
* creation time */ |
||||
int32 nColumns; |
||||
|
||||
/*
|
||||
* sizeOfBloomTuple is index's specific, and it depends on reloptions, so |
||||
* precompute it |
||||
*/ |
||||
int32 sizeOfBloomTuple; |
||||
} BloomState; |
||||
|
||||
/*
 * Bytes still available for tuples on a page: block size minus the page
 * header, the tuples already stored and the opaque area.
 */
#define BloomPageGetFreeSpace(state, page) \
	(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
		- BloomPageGetMaxOffset(page) * (state)->sizeOfBloomTuple \
		- MAXALIGN(sizeof(BloomPageOpaqueData)))
||||
|
||||
/*
|
||||
* Tuples are very different from all other relations |
||||
*/ |
||||
typedef uint16 SignType; |
||||
|
||||
typedef struct BloomTuple |
||||
{ |
||||
ItemPointerData heapPtr; |
||||
SignType sign[1]; |
||||
} BloomTuple; |
||||
|
||||
#define BLOOMTUPLEHDRSZ offsetof(BloomTuple, sign) |
||||
|
||||
/* Opaque data structure for bloom index scan */ |
||||
typedef struct BloomScanOpaqueData |
||||
{ |
||||
SignType *sign; /* Scan signature */ |
||||
BloomState state; |
||||
} BloomScanOpaqueData; |
||||
|
||||
typedef BloomScanOpaqueData *BloomScanOpaque; |
||||
|
||||
/* blutils.c */ |
||||
extern void _PG_init(void); |
||||
extern Datum blhandler(PG_FUNCTION_ARGS); |
||||
extern void initBloomState(BloomState * state, Relation index); |
||||
extern void BloomInitMetapage(Relation index); |
||||
extern void BloomInitPage(Page page, uint16 flags); |
||||
extern Buffer BloomNewBuffer(Relation index); |
||||
extern void signValue(BloomState * state, SignType * sign, Datum value, int attno); |
||||
extern BloomTuple *BloomFormTuple(BloomState * state, ItemPointer iptr, Datum *values, bool *isnull); |
||||
extern bool BloomPageAddItem(BloomState * state, Page page, BloomTuple * tuple); |
||||
|
||||
/* blvalidate.c */ |
||||
extern bool blvalidate(Oid opclassoid); |
||||
|
||||
/* index access method interface functions */ |
||||
extern bool blinsert(Relation index, Datum *values, bool *isnull, |
||||
ItemPointer ht_ctid, Relation heapRel, |
||||
IndexUniqueCheck checkUnique); |
||||
extern IndexScanDesc blbeginscan(Relation r, int nkeys, int norderbys); |
||||
extern int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); |
||||
extern void blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
||||
ScanKey orderbys, int norderbys); |
||||
extern void blendscan(IndexScanDesc scan); |
||||
extern IndexBuildResult *blbuild(Relation heap, Relation index, |
||||
struct IndexInfo *indexInfo); |
||||
extern void blbuildempty(Relation index); |
||||
extern IndexBulkDeleteResult *blbulkdelete(IndexVacuumInfo *info, |
||||
IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, |
||||
void *callback_state); |
||||
extern IndexBulkDeleteResult *blvacuumcleanup(IndexVacuumInfo *info, |
||||
IndexBulkDeleteResult *stats); |
||||
extern bytea *bloptions(Datum reloptions, bool validate); |
||||
extern void blcostestimate(PlannerInfo *root, IndexPath *path, |
||||
double loop_count, Cost *indexStartupCost, |
||||
Cost *indexTotalCost, Selectivity *indexSelectivity, |
||||
double *indexCorrelation); |
||||
|
||||
#endif |
@ -0,0 +1,175 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* blscan.c |
||||
* Bloom index scan functions. |
||||
* |
||||
* Copyright (c) 2016, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/bloom/blscan.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/relscan.h" |
||||
#include "pgstat.h" |
||||
#include "miscadmin.h" |
||||
#include "storage/bufmgr.h" |
||||
#include "storage/lmgr.h" |
||||
#include "utils/memutils.h" |
||||
#include "utils/rel.h" |
||||
|
||||
#include "bloom.h" |
||||
|
||||
/*
|
||||
* Begin scan of bloom index. |
||||
*/ |
||||
IndexScanDesc |
||||
blbeginscan(Relation r, int nkeys, int norderbys) |
||||
{ |
||||
IndexScanDesc scan; |
||||
|
||||
scan = RelationGetIndexScan(r, nkeys, norderbys); |
||||
|
||||
return scan; |
||||
} |
||||
|
||||
/*
|
||||
* Rescan a bloom index. |
||||
*/ |
||||
void |
||||
blrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
||||
ScanKey orderbys, int norderbys) |
||||
{ |
||||
BloomScanOpaque so; |
||||
|
||||
so = (BloomScanOpaque) scan->opaque; |
||||
|
||||
if (so == NULL) |
||||
{ |
||||
/* if called from blbeginscan */ |
||||
so = (BloomScanOpaque) palloc(sizeof(BloomScanOpaqueData)); |
||||
initBloomState(&so->state, scan->indexRelation); |
||||
scan->opaque = so; |
||||
|
||||
} |
||||
else |
||||
{ |
||||
if (so->sign) |
||||
pfree(so->sign); |
||||
} |
||||
so->sign = NULL; |
||||
|
||||
if (scankey && scan->numberOfKeys > 0) |
||||
{ |
||||
memmove(scan->keyData, scankey, |
||||
scan->numberOfKeys * sizeof(ScanKeyData)); |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* End scan of bloom index. |
||||
*/ |
||||
void |
||||
blendscan(IndexScanDesc scan) |
||||
{ |
||||
BloomScanOpaque so = (BloomScanOpaque) scan->opaque; |
||||
|
||||
if (so->sign) |
||||
pfree(so->sign); |
||||
so->sign = NULL; |
||||
} |
||||
|
||||
/*
|
||||
* Insert all matching tuples into to a bitmap. |
||||
*/ |
||||
int64 |
||||
blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) |
||||
{ |
||||
int64 ntids = 0; |
||||
BlockNumber blkno = BLOOM_HEAD_BLKNO, |
||||
npages; |
||||
int i; |
||||
BufferAccessStrategy bas; |
||||
BloomScanOpaque so = (BloomScanOpaque) scan->opaque; |
||||
|
||||
if (so->sign == NULL && scan->numberOfKeys > 0) |
||||
{ |
||||
/* New search: have to calculate search signature */ |
||||
ScanKey skey = scan->keyData; |
||||
|
||||
so->sign = palloc0(sizeof(SignType) * so->state.opts->bloomLength); |
||||
|
||||
for (i = 0; i < scan->numberOfKeys; i++) |
||||
{ |
||||
/*
|
||||
* Assume bloom-indexable operators to be strict, so nothing could |
||||
* be found for NULL key. |
||||
*/ |
||||
if (skey->sk_flags & SK_ISNULL) |
||||
{ |
||||
pfree(so->sign); |
||||
so->sign = NULL; |
||||
return 0; |
||||
} |
||||
|
||||
/* Add next value to the signature */ |
||||
signValue(&so->state, so->sign, skey->sk_argument, |
||||
skey->sk_attno - 1); |
||||
|
||||
skey++; |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* We're going to read the whole index. This is why we use appropriate |
||||
* buffer access strategy. |
||||
*/ |
||||
bas = GetAccessStrategy(BAS_BULKREAD); |
||||
npages = RelationGetNumberOfBlocks(scan->indexRelation); |
||||
|
||||
for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) |
||||
{ |
||||
Buffer buffer; |
||||
Page page; |
||||
|
||||
buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, |
||||
blkno, RBM_NORMAL, bas); |
||||
|
||||
LockBuffer(buffer, BUFFER_LOCK_SHARE); |
||||
page = BufferGetPage(buffer); |
||||
|
||||
if (!BloomPageIsDeleted(page)) |
||||
{ |
||||
OffsetNumber offset, |
||||
maxOffset = BloomPageGetMaxOffset(page); |
||||
|
||||
for (offset = 1; offset <= maxOffset; offset++) |
||||
{ |
||||
BloomTuple *itup = BloomPageGetTuple(&so->state, page, offset); |
||||
bool res = true; |
||||
|
||||
/* Check index signature with scan signature */ |
||||
for (i = 0; res && i < so->state.opts->bloomLength; i++) |
||||
{ |
||||
if ((itup->sign[i] & so->sign[i]) != so->sign[i]) |
||||
res = false; |
||||
} |
||||
|
||||
/* Add matching tuples to bitmap */ |
||||
if (res) |
||||
{ |
||||
tbm_add_tuples(tbm, &itup->heapPtr, 1, true); |
||||
ntids++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
UnlockReleaseBuffer(buffer); |
||||
CHECK_FOR_INTERRUPTS(); |
||||
} |
||||
FreeAccessStrategy(bas); |
||||
|
||||
return ntids; |
||||
} |
@ -0,0 +1,463 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* blutils.c |
||||
* Bloom index utilities. |
||||
* |
||||
* Portions Copyright (c) 2016, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1990-1993, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/bloom/blutils.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/amapi.h" |
||||
#include "access/generic_xlog.h" |
||||
#include "catalog/index.h" |
||||
#include "storage/lmgr.h" |
||||
#include "miscadmin.h" |
||||
#include "storage/bufmgr.h" |
||||
#include "storage/indexfsm.h" |
||||
#include "utils/memutils.h" |
||||
#include "access/reloptions.h" |
||||
#include "storage/freespace.h" |
||||
#include "storage/indexfsm.h" |
||||
|
||||
#include "bloom.h" |
||||
|
||||
/*
 * Signature bit manipulation.  "x" points to an array of SignType words and
 * "i" is a bit index into it.
 */
#define BITSIGNTYPE		(BITS_PER_BYTE * sizeof(SignType))
#define GETWORD(x,i)	( *( (SignType*)(x) + (int)( (i) / BITSIGNTYPE ) ) )
#define CLRBIT(x,i)		GETWORD(x,i) &= ~( 0x01 << ( (i) % BITSIGNTYPE ) )
#define SETBIT(x,i)		GETWORD(x,i) |= ( 0x01 << ( (i) % BITSIGNTYPE ) )
#define GETBIT(x,i)		( (GETWORD(x,i) >> ( (i) % BITSIGNTYPE )) & 0x01 )
||||
|
||||
PG_FUNCTION_INFO_V1(blhandler); |
||||
|
||||
/* Kind of relation options for bloom index */
||||
static relopt_kind bl_relopt_kind; |
||||
|
||||
static int32 myRand(); |
||||
static void mySrand(uint32 seed); |
||||
|
||||
/*
|
||||
* Module initialize function: initilized relation options. |
||||
*/ |
||||
void |
||||
_PG_init(void) |
||||
{ |
||||
int i; |
||||
char buf[16]; |
||||
|
||||
bl_relopt_kind = add_reloption_kind(); |
||||
|
||||
add_int_reloption(bl_relopt_kind, "length", |
||||
"Length of signature in uint16 type", 5, 1, 256); |
||||
|
||||
for (i = 0; i < INDEX_MAX_KEYS; i++) |
||||
{ |
||||
snprintf(buf, 16, "col%d", i + 1); |
||||
add_int_reloption(bl_relopt_kind, buf, |
||||
"Number of bits for corresponding column", 2, 1, 2048); |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* Bloom handler function: return IndexAmRoutine with access method parameters |
||||
* and callbacks. |
||||
*/ |
||||
Datum |
||||
blhandler(PG_FUNCTION_ARGS) |
||||
{ |
||||
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
||||
|
||||
amroutine->amstrategies = 1; |
||||
amroutine->amsupport = 1; |
||||
amroutine->amcanorder = false; |
||||
amroutine->amcanorderbyop = false; |
||||
amroutine->amcanbackward = false; |
||||
amroutine->amcanunique = false; |
||||
amroutine->amcanmulticol = true; |
||||
amroutine->amoptionalkey = true; |
||||
amroutine->amsearcharray = false; |
||||
amroutine->amsearchnulls = false; |
||||
amroutine->amstorage = false; |
||||
amroutine->amclusterable = false; |
||||
amroutine->ampredlocks = false; |
||||
amroutine->amkeytype = 0; |
||||
|
||||
amroutine->aminsert = blinsert; |
||||
amroutine->ambeginscan = blbeginscan; |
||||
amroutine->amgettuple = NULL; |
||||
amroutine->amgetbitmap = blgetbitmap; |
||||
amroutine->amrescan = blrescan; |
||||
amroutine->amendscan = blendscan; |
||||
amroutine->ammarkpos = NULL; |
||||
amroutine->amrestrpos = NULL; |
||||
amroutine->ambuild = blbuild; |
||||
amroutine->ambuildempty = blbuildempty; |
||||
amroutine->ambulkdelete = blbulkdelete; |
||||
amroutine->amvacuumcleanup = blvacuumcleanup; |
||||
amroutine->amcanreturn = NULL; |
||||
amroutine->amcostestimate = blcostestimate; |
||||
amroutine->amoptions = bloptions; |
||||
amroutine->amvalidate = blvalidate; |
||||
|
||||
PG_RETURN_POINTER(amroutine); |
||||
} |
||||
|
||||
/*
|
||||
* Fill BloomState structure for particular index. |
||||
*/ |
||||
void |
||||
initBloomState(BloomState *state, Relation index) |
||||
{ |
||||
int i; |
||||
|
||||
state->nColumns = index->rd_att->natts; |
||||
|
||||
/* Initialize hash function for each attribute */ |
||||
for (i = 0; i < index->rd_att->natts; i++) |
||||
{ |
||||
fmgr_info_copy(&(state->hashFn[i]), |
||||
index_getprocinfo(index, i + 1, BLOOM_HASH_PROC), |
||||
CurrentMemoryContext); |
||||
} |
||||
|
||||
/* Initialize amcache if needed with options from metapage */ |
||||
if (!index->rd_amcache) |
||||
{ |
||||
Buffer buffer; |
||||
Page page; |
||||
BloomMetaPageData *meta; |
||||
BloomOptions *opts; |
||||
|
||||
opts = MemoryContextAlloc(index->rd_indexcxt, sizeof(BloomOptions)); |
||||
|
||||
buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); |
||||
LockBuffer(buffer, BUFFER_LOCK_SHARE); |
||||
|
||||
page = BufferGetPage(buffer); |
||||
|
||||
if (!BloomPageIsMeta(page)) |
||||
elog(ERROR, "Relation is not a bloom index"); |
||||
meta = BloomPageGetMeta(BufferGetPage(buffer)); |
||||
|
||||
if (meta->magickNumber != BLOOM_MAGICK_NUMBER) |
||||
elog(ERROR, "Relation is not a bloom index"); |
||||
|
||||
*opts = meta->opts; |
||||
|
||||
UnlockReleaseBuffer(buffer); |
||||
|
||||
index->rd_amcache = (void *) opts; |
||||
} |
||||
|
||||
state->opts = (BloomOptions *) index->rd_amcache; |
||||
state->sizeOfBloomTuple = BLOOMTUPLEHDRSZ + |
||||
sizeof(SignType) * state->opts->bloomLength; |
||||
} |
||||
|
||||
/*
|
||||
* Random generator copied from FreeBSD. Using own random generator here for |
||||
* two reasons: |
||||
* |
||||
* 1) In this case random numbers are used for on-disk storage. Usage of |
||||
* PostgreSQL number generator would obstruct it from all possible changes. |
||||
* 2) Changing seed of PostgreSQL random generator would be undesirable side |
||||
* effect. |
||||
*/ |
||||
static int32 next; |
||||
|
||||
static int32 |
||||
myRand() |
||||
{ |
||||
/*
|
||||
* Compute x = (7^5 * x) mod (2^31 - 1) |
||||
* without overflowing 31 bits: |
||||
* (2^31 - 1) = 127773 * (7^5) + 2836 |
||||
* From "Random number generators: good ones are hard to find", |
||||
* Park and Miller, Communications of the ACM, vol. 31, no. 10, |
||||
* October 1988, p. 1195. |
||||
*/ |
||||
int32 hi, lo, x; |
||||
|
||||
/* Must be in [1, 0x7ffffffe] range at this point. */ |
||||
hi = next / 127773; |
||||
lo = next % 127773; |
||||
x = 16807 * lo - 2836 * hi; |
||||
if (x < 0) |
||||
x += 0x7fffffff; |
||||
next = x; |
||||
/* Transform to [0, 0x7ffffffd] range. */ |
||||
return (x - 1); |
||||
} |
||||
|
||||
void |
||||
mySrand(uint32 seed) |
||||
{ |
||||
next = seed; |
||||
/* Transform to [1, 0x7ffffffe] range. */ |
||||
next = (next % 0x7ffffffe) + 1; |
||||
} |
||||
|
||||
/*
|
||||
* Add bits of given value to the signature. |
||||
*/ |
||||
void |
||||
signValue(BloomState *state, SignType *sign, Datum value, int attno) |
||||
{ |
||||
uint32 hashVal; |
||||
int nBit, |
||||
j; |
||||
|
||||
/*
|
||||
* init generator with "column's" number to get "hashed" seed for new |
||||
* value. We don't want to map the same numbers from different columns |
||||
* into the same bits! |
||||
*/ |
||||
mySrand(attno); |
||||
|
||||
/*
|
||||
* Init hash sequence to map our value into bits. the same values in |
||||
* different columns will be mapped into different bits because of step |
||||
* above |
||||
*/ |
||||
hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value)); |
||||
mySrand(hashVal ^ myRand()); |
||||
|
||||
for (j = 0; j < state->opts->bitSize[attno]; j++) |
||||
{ |
||||
/* prevent mutiple evaluation */ |
||||
nBit = myRand() % (state->opts->bloomLength * BITSIGNTYPE); |
||||
SETBIT(sign, nBit); |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* Make bloom tuple from values. |
||||
*/ |
||||
BloomTuple * |
||||
BloomFormTuple(BloomState *state, ItemPointer iptr, Datum *values, bool *isnull) |
||||
{ |
||||
int i; |
||||
BloomTuple *res = (BloomTuple *) palloc0(state->sizeOfBloomTuple); |
||||
|
||||
res->heapPtr = *iptr; |
||||
|
||||
/* Blooming each column */ |
||||
for (i = 0; i < state->nColumns; i++) |
||||
{ |
||||
/* skip nulls */ |
||||
if (isnull[i]) |
||||
continue; |
||||
|
||||
signValue(state, res->sign, values[i], i); |
||||
} |
||||
|
||||
return res; |
||||
} |
||||
|
||||
/*
|
||||
* Add new bloom tuple to the page. Returns true if new tuple was successfully |
||||
* added to the page. Returns false if it doesn't git the page. |
||||
*/ |
||||
bool |
||||
BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple) |
||||
{ |
||||
BloomTuple *itup; |
||||
BloomPageOpaque opaque; |
||||
Pointer ptr; |
||||
|
||||
/* Does new tuple fit the page */ |
||||
if (BloomPageGetFreeSpace(state, page) < state->sizeOfBloomTuple) |
||||
return false; |
||||
|
||||
/* Copy new tuple to the end of page */ |
||||
opaque = BloomPageGetOpaque(page); |
||||
itup = BloomPageGetTuple(state, page, opaque->maxoff + 1); |
||||
memcpy((Pointer) itup, (Pointer) tuple, state->sizeOfBloomTuple); |
||||
|
||||
/* Adjust maxoff and pd_lower */ |
||||
opaque->maxoff++; |
||||
ptr = (Pointer) BloomPageGetTuple(state, page, opaque->maxoff + 1); |
||||
((PageHeader) page)->pd_lower = ptr - page; |
||||
|
||||
return true; |
||||
} |
||||
|
||||
/*
|
||||
* Allocate a new page (either by recycling, or by extending the index file) |
||||
* The returned buffer is already pinned and exclusive-locked |
||||
* Caller is responsible for initializing the page by calling BloomInitBuffer |
||||
*/ |
||||
Buffer |
||||
BloomNewBuffer(Relation index) |
||||
{ |
||||
Buffer buffer; |
||||
bool needLock; |
||||
|
||||
/* First, try to get a page from FSM */ |
||||
for (;;) |
||||
{ |
||||
BlockNumber blkno = GetFreeIndexPage(index); |
||||
|
||||
if (blkno == InvalidBlockNumber) |
||||
break; |
||||
|
||||
buffer = ReadBuffer(index, blkno); |
||||
|
||||
/*
|
||||
* We have to guard against the possibility that someone else already |
||||
* recycled this page; the buffer may be locked if so. |
||||
*/ |
||||
if (ConditionalLockBuffer(buffer)) |
||||
{ |
||||
Page page = BufferGetPage(buffer); |
||||
|
||||
if (PageIsNew(page)) |
||||
return buffer; /* OK to use, if never initialized */ |
||||
|
||||
if (BloomPageIsDeleted(page)) |
||||
return buffer; /* OK to use */ |
||||
|
||||
LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
||||
} |
||||
|
||||
/* Can't use it, so release buffer and try again */ |
||||
ReleaseBuffer(buffer); |
||||
} |
||||
|
||||
/* Must extend the file */ |
||||
needLock = !RELATION_IS_LOCAL(index); |
||||
if (needLock) |
||||
LockRelationForExtension(index, ExclusiveLock); |
||||
|
||||
buffer = ReadBuffer(index, P_NEW); |
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
||||
|
||||
if (needLock) |
||||
UnlockRelationForExtension(index, ExclusiveLock); |
||||
|
||||
return buffer; |
||||
} |
||||
|
||||
/*
|
||||
* Initialize bloom page. |
||||
*/ |
||||
void |
||||
BloomInitPage(Page page, uint16 flags) |
||||
{ |
||||
BloomPageOpaque opaque; |
||||
|
||||
PageInit(page, BLCKSZ, sizeof(BloomPageOpaqueData)); |
||||
|
||||
opaque = BloomPageGetOpaque(page); |
||||
memset(opaque, 0, sizeof(BloomPageOpaqueData)); |
||||
opaque->flags = flags; |
||||
} |
||||
|
||||
/*
|
||||
* Adjust options of bloom index. |
||||
*/ |
||||
static void |
||||
adjustBloomOptions(BloomOptions *opts) |
||||
{ |
||||
int i; |
||||
|
||||
/* Default length of bloom filter is 5 of 16-bit integers */ |
||||
if (opts->bloomLength <= 0) |
||||
opts->bloomLength = 5; |
||||
else |
||||
opts->bloomLength = opts->bloomLength; |
||||
|
||||
/* Check singnature length */ |
||||
for (i = 0; i < INDEX_MAX_KEYS; i++) |
||||
{ |
||||
/*
|
||||
* Zero and negative number of bits is meaningless. Also setting |
||||
* more bits than signature have seems useless. Replace both cases |
||||
* with 2 bits default. |
||||
*/ |
||||
if (opts->bitSize[i] <= 0 |
||||
|| opts->bitSize[i] >= opts->bloomLength * sizeof(SignType)) |
||||
opts->bitSize[i] = 2; |
||||
} |
||||
} |
||||
|
||||
/*
|
||||
* Initialize metapage for bloom index. |
||||
*/ |
||||
void |
||||
BloomInitMetapage(Relation index) |
||||
{ |
||||
Page metaPage; |
||||
Buffer metaBuffer; |
||||
BloomMetaPageData *metadata; |
||||
GenericXLogState *state; |
||||
|
||||
/*
|
||||
* Make a new buffer, since it first buffer it should be associated with |
||||
* block number 0 (BLOOM_METAPAGE_BLKNO). |
||||
*/ |
||||
metaBuffer = BloomNewBuffer(index); |
||||
Assert(BufferGetBlockNumber(metaBuffer) == BLOOM_METAPAGE_BLKNO); |
||||
|
||||
/* Initialize bloom index options */ |
||||
if (!index->rd_options) |
||||
index->rd_options = palloc0(sizeof(BloomOptions)); |
||||
adjustBloomOptions((BloomOptions *) index->rd_options); |
||||
|
||||
/* Initialize contents of meta page */ |
||||
state = GenericXLogStart(index); |
||||
metaPage = GenericXLogRegister(state, metaBuffer, true); |
||||
|
||||
BloomInitPage(metaPage, BLOOM_META); |
||||
metadata = BloomPageGetMeta(metaPage); |
||||
memset(metadata, 0, sizeof(BloomMetaPageData)); |
||||
metadata->magickNumber = BLOOM_MAGICK_NUMBER; |
||||
metadata->opts = *((BloomOptions *) index->rd_options); |
||||
((PageHeader) metaPage)->pd_lower += sizeof(BloomMetaPageData); |
||||
|
||||
GenericXLogFinish(state); |
||||
UnlockReleaseBuffer(metaBuffer); |
||||
} |
||||
|
||||
/*
|
||||
* Initialize options for bloom index. |
||||
*/ |
||||
bytea * |
||||
bloptions(Datum reloptions, bool validate) |
||||
{ |
||||
relopt_value *options; |
||||
int numoptions; |
||||
BloomOptions *rdopts; |
||||
relopt_parse_elt tab[INDEX_MAX_KEYS + 1]; |
||||
int i; |
||||
char buf[16]; |
||||
|
||||
/* Option for length of signature */ |
||||
tab[0].optname = "length"; |
||||
tab[0].opttype = RELOPT_TYPE_INT; |
||||
tab[0].offset = offsetof(BloomOptions, bloomLength); |
||||
|
||||
/* Number of bits for each of possible columns: col1, col2, ... */ |
||||
for (i = 0; i < INDEX_MAX_KEYS; i++) |
||||
{ |
||||
snprintf(buf, sizeof(buf), "col%d", i + 1); |
||||
tab[i + 1].optname = pstrdup(buf); |
||||
tab[i + 1].opttype = RELOPT_TYPE_INT; |
||||
tab[i + 1].offset = offsetof(BloomOptions, bitSize[i]); |
||||
} |
||||
|
||||
options = parseRelOptions(reloptions, validate, bl_relopt_kind, &numoptions); |
||||
rdopts = allocateReloptStruct(sizeof(BloomOptions), options, numoptions); |
||||
fillRelOptions((void *) rdopts, sizeof(BloomOptions), options, numoptions, |
||||
validate, tab, INDEX_MAX_KEYS + 1); |
||||
|
||||
adjustBloomOptions(rdopts); |
||||
|
||||
return (bytea *) rdopts; |
||||
} |
@ -0,0 +1,212 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* blvacuum.c |
||||
* Bloom VACUUM functions. |
||||
* |
||||
* Copyright (c) 2016, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/bloom/blvacuum.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/genam.h" |
||||
#include "catalog/storage.h" |
||||
#include "commands/vacuum.h" |
||||
#include "miscadmin.h" |
||||
#include "postmaster/autovacuum.h" |
||||
#include "storage/bufmgr.h" |
||||
#include "storage/indexfsm.h" |
||||
#include "storage/lmgr.h" |
||||
|
||||
#include "bloom.h" |
||||
|
||||
/*
|
||||
* Bulk deletion of all index entries pointing to a set of heap tuples. |
||||
* The set of target tuples is specified via a callback routine that tells |
||||
* whether any given heap tuple (identified by ItemPointer) is being deleted. |
||||
* |
||||
* Result: a palloc'd struct containing statistical info for VACUUM displays. |
||||
*/ |
||||
IndexBulkDeleteResult * |
||||
blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
||||
IndexBulkDeleteCallback callback, void *callback_state) |
||||
{ |
||||
Relation index = info->index; |
||||
BlockNumber blkno, |
||||
npages; |
||||
FreeBlockNumberArray notFullPage; |
||||
int countPage = 0; |
||||
BloomState state; |
||||
Buffer buffer; |
||||
Page page; |
||||
GenericXLogState *gxlogState; |
||||
|
||||
if (stats == NULL) |
||||
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
||||
|
||||
initBloomState(&state, index); |
||||
|
||||
/*
|
||||
* Interate over the pages. We don't care about concurrently added pages, |
||||
* they can't contain tuples to delete. |
||||
*/ |
||||
npages = RelationGetNumberOfBlocks(index); |
||||
for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) |
||||
{ |
||||
BloomTuple *itup, |
||||
*itupPtr, |
||||
*itupEnd; |
||||
|
||||
buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, |
||||
RBM_NORMAL, info->strategy); |
||||
|
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
||||
gxlogState = GenericXLogStart(index); |
||||
page = GenericXLogRegister(gxlogState, buffer, false); |
||||
|
||||
if (BloomPageIsDeleted(page)) |
||||
{ |
||||
UnlockReleaseBuffer(buffer); |
||||
CHECK_FOR_INTERRUPTS(); |
||||
continue; |
||||
} |
||||
|
||||
/* Iterate over the tuples */ |
||||
itup = BloomPageGetTuple(&state, page, 1); |
||||
itupPtr = BloomPageGetTuple(&state, page, 1); |
||||
itupEnd = BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1); |
||||
while (itup < itupEnd) |
||||
{ |
||||
/* Do we have to delete this tuple? */ |
||||
if (callback(&itup->heapPtr, callback_state)) |
||||
{ |
||||
stats->tuples_removed += 1; |
||||
BloomPageGetOpaque(page)->maxoff--; |
||||
} |
||||
else |
||||
{ |
||||
if (itupPtr != itup) |
||||
{ |
||||
/*
|
||||
* If we already delete something before, we have to move |
||||
* this tuple backward. |
||||
*/ |
||||
memmove((Pointer) itupPtr, (Pointer) itup, |
||||
state.sizeOfBloomTuple); |
||||
} |
||||
stats->num_index_tuples++; |
||||
itupPtr = BloomPageGetNextTuple(&state, itupPtr); |
||||
} |
||||
|
||||
itup = BloomPageGetNextTuple(&state, itup); |
||||
} |
||||
|
||||
Assert(itupPtr == BloomPageGetTuple(&state, page, BloomPageGetMaxOffset(page) + 1)); |
||||
|
||||
if (!BloomPageIsDeleted(page) && |
||||
BloomPageGetFreeSpace(&state, page) > state.sizeOfBloomTuple && |
||||
countPage < BloomMetaBlockN) |
||||
notFullPage[countPage++] = blkno; |
||||
|
||||
/* Did we delete something? */ |
||||
if (itupPtr != itup) |
||||
{ |
||||
/* Is it empty page now? */ |
||||
if (itupPtr == BloomPageGetData(page)) |
||||
BloomPageSetDeleted(page); |
||||
/* Adjust pg_lower */ |
||||
((PageHeader) page)->pd_lower = (Pointer) itupPtr - page; |
||||
/* Finish WAL-logging */ |
||||
GenericXLogFinish(gxlogState); |
||||
} |
||||
else |
||||
{ |
||||
/* Didn't change anything: abort WAL-logging */ |
||||
GenericXLogAbort(gxlogState); |
||||
} |
||||
UnlockReleaseBuffer(buffer); |
||||
CHECK_FOR_INTERRUPTS(); |
||||
} |
||||
|
||||
if (countPage > 0) |
||||
{ |
||||
BloomMetaPageData *metaData; |
||||
|
||||
buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO); |
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
||||
|
||||
gxlogState = GenericXLogStart(index); |
||||
page = GenericXLogRegister(gxlogState, buffer, false); |
||||
|
||||
metaData = BloomPageGetMeta(page); |
||||
memcpy(metaData->notFullPage, notFullPage, sizeof(FreeBlockNumberArray)); |
||||
metaData->nStart = 0; |
||||
metaData->nEnd = countPage; |
||||
|
||||
GenericXLogFinish(gxlogState); |
||||
UnlockReleaseBuffer(buffer); |
||||
} |
||||
|
||||
return stats; |
||||
} |
||||
|
||||
/*
|
||||
* Post-VACUUM cleanup. |
||||
* |
||||
* Result: a palloc'd struct containing statistical info for VACUUM displays. |
||||
*/ |
||||
IndexBulkDeleteResult * |
||||
blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
||||
{ |
||||
Relation index = info->index; |
||||
BlockNumber npages, |
||||
blkno; |
||||
BlockNumber totFreePages; |
||||
|
||||
if (info->analyze_only) |
||||
return stats; |
||||
|
||||
if (stats == NULL) |
||||
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
||||
|
||||
/*
|
||||
* Iterate over the pages: insert deleted pages into FSM and collect |
||||
* statistics. |
||||
*/ |
||||
npages = RelationGetNumberOfBlocks(index); |
||||
totFreePages = 0; |
||||
for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) |
||||
{ |
||||
Buffer buffer; |
||||
Page page; |
||||
|
||||
vacuum_delay_point(); |
||||
|
||||
buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, |
||||
RBM_NORMAL, info->strategy); |
||||
LockBuffer(buffer, BUFFER_LOCK_SHARE); |
||||
page = (Page) BufferGetPage(buffer); |
||||
|
||||
if (BloomPageIsDeleted(page)) |
||||
{ |
||||
RecordFreeIndexPage(index, blkno); |
||||
totFreePages++; |
||||
} |
||||
else |
||||
{ |
||||
stats->num_index_tuples += BloomPageGetMaxOffset(page); |
||||
stats->estimated_count += BloomPageGetMaxOffset(page); |
||||
} |
||||
|
||||
UnlockReleaseBuffer(buffer); |
||||
} |
||||
|
||||
IndexFreeSpaceMapVacuum(info->index); |
||||
stats->pages_free = totFreePages; |
||||
stats->num_pages = RelationGetNumberOfBlocks(index); |
||||
|
||||
return stats; |
||||
} |
@ -0,0 +1,220 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* blvalidate.c |
||||
* Opclass validator for bloom. |
||||
* |
||||
* Copyright (c) 2016, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* contrib/bloom/blvalidate.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/amvalidate.h" |
||||
#include "access/htup_details.h" |
||||
#include "catalog/pg_amop.h" |
||||
#include "catalog/pg_amproc.h" |
||||
#include "catalog/pg_opclass.h" |
||||
#include "catalog/pg_opfamily.h" |
||||
#include "catalog/pg_type.h" |
||||
#include "utils/builtins.h" |
||||
#include "utils/lsyscache.h" |
||||
#include "utils/syscache.h" |
||||
|
||||
#include "bloom.h" |
||||
|
||||
/*
|
||||
* Validator for a bloom opclass. |
||||
*/ |
||||
bool |
||||
blvalidate(Oid opclassoid) |
||||
{ |
||||
bool result = true; |
||||
HeapTuple classtup; |
||||
Form_pg_opclass classform; |
||||
Oid opfamilyoid; |
||||
Oid opcintype; |
||||
Oid opckeytype; |
||||
char *opclassname; |
||||
HeapTuple familytup; |
||||
Form_pg_opfamily familyform; |
||||
char *opfamilyname; |
||||
CatCList *proclist, |
||||
*oprlist; |
||||
List *grouplist; |
||||
OpFamilyOpFuncGroup *opclassgroup; |
||||
int i; |
||||
ListCell *lc; |
||||
|
||||
/* Fetch opclass information */ |
||||
classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); |
||||
if (!HeapTupleIsValid(classtup)) |
||||
elog(ERROR, "cache lookup failed for operator class %u", opclassoid); |
||||
classform = (Form_pg_opclass) GETSTRUCT(classtup); |
||||
|
||||
opfamilyoid = classform->opcfamily; |
||||
opcintype = classform->opcintype; |
||||
opckeytype = classform->opckeytype; |
||||
if (!OidIsValid(opckeytype)) |
||||
opckeytype = opcintype; |
||||
opclassname = NameStr(classform->opcname); |
||||
|
||||
/* Fetch opfamily information */ |
||||
familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); |
||||
if (!HeapTupleIsValid(familytup)) |
||||
elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); |
||||
familyform = (Form_pg_opfamily) GETSTRUCT(familytup); |
||||
|
||||
opfamilyname = NameStr(familyform->opfname); |
||||
|
||||
/* Fetch all operators and support functions of the opfamily */ |
||||
oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); |
||||
proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); |
||||
|
||||
/* Check individual support functions */ |
||||
for (i = 0; i < proclist->n_members; i++) |
||||
{ |
||||
HeapTuple proctup = &proclist->members[i]->tuple; |
||||
Form_pg_amproc procform = (Form_pg_amproc) GETSTRUCT(proctup); |
||||
bool ok; |
||||
|
||||
/*
|
||||
* All bloom support functions should be registered with matching |
||||
* left/right types |
||||
*/ |
||||
if (procform->amproclefttype != procform->amprocrighttype) |
||||
{ |
||||
ereport(INFO, |
||||
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
||||
errmsg("bloom opfamily %s contains support procedure %s with cross-type registration", |
||||
opfamilyname, |
||||
format_procedure(procform->amproc)))); |
||||
result = false; |
||||
} |
||||
|
||||
/*
|
||||
* We can't check signatures except within the specific opclass, since |
||||
* we need to know the associated opckeytype in many cases. |
||||
*/ |
||||
if (procform->amproclefttype != opcintype) |
||||
continue; |
||||
|
||||
/* Check procedure numbers and function signatures */ |
||||
switch (procform->amprocnum) |
||||
{ |
||||
case BLOOM_HASH_PROC: |
||||
ok = check_amproc_signature(procform->amproc, INT4OID, false, |
||||
1, 1, opckeytype); |
||||
break; |
||||
default: |
||||
ereport(INFO, |
||||
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
||||
errmsg("bloom opfamily %s contains function %s with invalid support number %d", |
||||
opfamilyname, |
||||
format_procedure(procform->amproc), |
||||
procform->amprocnum))); |
||||
result = false; |
||||
continue; /* don't want additional message */ |
||||
} |
||||
|
||||
if (!ok) |
||||
{ |
||||
ereport(INFO, |
||||
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
||||
errmsg("gist opfamily %s contains function %s with wrong signature for support number %d", |
||||
opfamilyname, |
||||
format_procedure(procform->amproc), |
||||
procform->amprocnum))); |
||||
result = false; |
||||
} |
||||
} |
||||
|
||||
/* Check individual operators */ |
||||
for (i = 0; i < oprlist->n_members; i++) |
||||
{ |
||||
HeapTuple oprtup = &oprlist->members[i]->tuple; |
||||
Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); |
||||
|
||||
/* Check it's allowed strategy for bloom */ |
||||
if (oprform->amopstrategy < 1 || |
||||
oprform->amopstrategy > BLOOM_NSTRATEGIES) |
||||
{ |
||||
ereport(INFO, |
||||
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
||||
errmsg("bloom opfamily %s contains operator %s with invalid strategy number %d", |
||||
opfamilyname, |
||||
format_operator(oprform->amopopr), |
||||
oprform->amopstrategy))); |
||||
result = false; |
||||
} |
||||
|
||||
/* bloom doesn't support ORDER BY operators */ |
||||
if (oprform->amoppurpose != AMOP_SEARCH || |
||||
OidIsValid(oprform->amopsortfamily)) |
||||
{ |
||||
ereport(INFO, |
||||
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
||||
errmsg("bloom opfamily %s contains invalid ORDER BY specification for operator %s", |
||||
opfamilyname, |
||||
format_operator(oprform->amopopr)))); |
||||
result = false; |
||||
} |
||||
|
||||
/* Check operator signature --- same for all bloom strategies */ |
||||
if (!check_amop_signature(oprform->amopopr, BOOLOID, |
||||
oprform->amoplefttype, |
||||
oprform->amoprighttype)) |
||||
{ |
||||
ereport(INFO, |
||||
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
||||
errmsg("bloom opfamily %s contains operator %s with wrong signature", |
||||
opfamilyname, |
||||
format_operator(oprform->amopopr)))); |
||||
result = false; |
||||
} |
||||
} |
||||
|
||||
/* Now check for inconsistent groups of operators/functions */ |
||||
grouplist = identify_opfamily_groups(oprlist, proclist); |
||||
opclassgroup = NULL; |
||||
foreach(lc, grouplist) |
||||
{ |
||||
OpFamilyOpFuncGroup *thisgroup = (OpFamilyOpFuncGroup *) lfirst(lc); |
||||
|
||||
/* Remember the group exactly matching the test opclass */ |
||||
if (thisgroup->lefttype == opcintype && |
||||
thisgroup->righttype == opcintype) |
||||
opclassgroup = thisgroup; |
||||
|
||||
/*
|
||||
* There is not a lot we can do to check the operator sets, since each |
||||
* bloom opclass is more or less a law unto itself, and some contain |
||||
* only operators that are binary-compatible with the opclass datatype |
||||
* (meaning that empty operator sets can be OK). That case also means |
||||
* that we shouldn't insist on nonempty function sets except for the |
||||
* opclass's own group. |
||||
*/ |
||||
} |
||||
|
||||
/* Check that the originally-named opclass is complete */ |
||||
for (i = 1; i <= BLOOM_NPROC; i++) |
||||
{ |
||||
if (opclassgroup && |
||||
(opclassgroup->functionset & (((uint64) 1) << i)) != 0) |
||||
continue; /* got it */ |
||||
ereport(INFO, |
||||
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), |
||||
errmsg("bloom opclass %s is missing support function %d", |
||||
opclassname, i))); |
||||
result = false; |
||||
} |
||||
|
||||
ReleaseCatCacheList(proclist); |
||||
ReleaseCatCacheList(oprlist); |
||||
ReleaseSysCache(familytup); |
||||
ReleaseSysCache(classtup); |
||||
|
||||
return result; |
||||
} |
@ -0,0 +1,122 @@ |
||||
CREATE EXTENSION bloom; |
||||
CREATE TABLE tst ( |
||||
i int4, |
||||
t text |
||||
); |
||||
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; |
||||
CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3); |
||||
SET enable_seqscan=on; |
||||
SET enable_bitmapscan=off; |
||||
SET enable_indexscan=off; |
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
count |
||||
------- |
||||
10000 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
count |
||||
------- |
||||
6264 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
count |
||||
------- |
||||
588 |
||||
(1 row) |
||||
|
||||
SET enable_seqscan=off; |
||||
SET enable_bitmapscan=on; |
||||
SET enable_indexscan=on; |
||||
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7; |
||||
QUERY PLAN |
||||
------------------------------------------- |
||||
Aggregate |
||||
-> Bitmap Heap Scan on tst |
||||
Recheck Cond: (i = 7) |
||||
-> Bitmap Index Scan on bloomidx |
||||
Index Cond: (i = 7) |
||||
(5 rows) |
||||
|
||||
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5'; |
||||
QUERY PLAN |
||||
------------------------------------------- |
||||
Aggregate |
||||
-> Bitmap Heap Scan on tst |
||||
Recheck Cond: (t = '5'::text) |
||||
-> Bitmap Index Scan on bloomidx |
||||
Index Cond: (t = '5'::text) |
||||
(5 rows) |
||||
|
||||
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
QUERY PLAN |
||||
--------------------------------------------------------- |
||||
Aggregate |
||||
-> Bitmap Heap Scan on tst |
||||
Recheck Cond: ((i = 7) AND (t = '5'::text)) |
||||
-> Bitmap Index Scan on bloomidx |
||||
Index Cond: ((i = 7) AND (t = '5'::text)) |
||||
(5 rows) |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
count |
||||
------- |
||||
10000 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
count |
||||
------- |
||||
6264 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
count |
||||
------- |
||||
588 |
||||
(1 row) |
||||
|
||||
DELETE FROM tst; |
||||
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; |
||||
VACUUM ANALYZE tst; |
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
count |
||||
------- |
||||
10000 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
count |
||||
------- |
||||
6264 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
count |
||||
------- |
||||
588 |
||||
(1 row) |
||||
|
||||
VACUUM FULL tst; |
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
count |
||||
------- |
||||
10000 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
count |
||||
------- |
||||
6264 |
||||
(1 row) |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
count |
||||
------- |
||||
588 |
||||
(1 row) |
||||
|
||||
RESET enable_seqscan; |
||||
RESET enable_bitmapscan; |
||||
RESET enable_indexscan; |
@ -0,0 +1,47 @@ |
||||
CREATE EXTENSION bloom; |
||||
|
||||
CREATE TABLE tst ( |
||||
i int4, |
||||
t text |
||||
); |
||||
|
||||
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; |
||||
CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3); |
||||
|
||||
SET enable_seqscan=on; |
||||
SET enable_bitmapscan=off; |
||||
SET enable_indexscan=off; |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
|
||||
SET enable_seqscan=off; |
||||
SET enable_bitmapscan=on; |
||||
SET enable_indexscan=on; |
||||
|
||||
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7; |
||||
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE t = '5'; |
||||
EXPLAIN (COSTS OFF) SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
|
||||
DELETE FROM tst; |
||||
INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i; |
||||
VACUUM ANALYZE tst; |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
|
||||
VACUUM FULL tst; |
||||
|
||||
SELECT count(*) FROM tst WHERE i = 7; |
||||
SELECT count(*) FROM tst WHERE t = '5'; |
||||
SELECT count(*) FROM tst WHERE i = 7 AND t = '5'; |
||||
|
||||
RESET enable_seqscan; |
||||
RESET enable_bitmapscan; |
||||
RESET enable_indexscan; |
@ -0,0 +1,75 @@ |
||||
# Test generic xlog record work for bloom index replication. |
||||
use strict; |
||||
use warnings; |
||||
use PostgresNode; |
||||
use TestLib; |
||||
use Test::More tests => 31; |
||||
|
||||
my $node_master; |
||||
my $node_standby; |
||||
|
||||
# Run a fixed set of queries on both master and standby, once the standby has
# caught up, and check that both nodes return the same output.
sub test_index_replay
{
	my $test_name = shift;

	# Wait until the standby's write location has reached the master's
	# current xlog location.
	my $applname = $node_standby->name;
	my $caughtup_query =
		"SELECT pg_current_xlog_location() <= write_location FROM pg_stat_replication WHERE application_name = '$applname';";
	$node_master->poll_query_until('postgres', $caughtup_query)
		or die "Timed out while waiting for standby 1 to catch up";

	# Queries on each column alone and on both columns, with sequential
	# scans disabled.
	my $queries = qq(SET enable_seqscan=off;
SET enable_bitmapscan=on;
SET enable_indexscan=on;
SELECT * FROM tst WHERE i = 0;
SELECT * FROM tst WHERE i = 3;
SELECT * FROM tst WHERE t = 'b';
SELECT * FROM tst WHERE t = 'f';
SELECT * FROM tst WHERE i = 3 AND t = 'c';
SELECT * FROM tst WHERE i = 7 AND t = 'e';
);

	# Run the queries on both nodes and compare their output
	my $master_result = $node_master->psql("postgres", $queries);
	my $standby_result = $node_standby->psql("postgres", $queries);

	is($master_result, $standby_result, "$test_name: query result matches");
}
||||
|
||||
# Initialize master node
$node_master = get_new_node('master');
$node_master->init(allows_streaming => 1);
$node_master->start;

# Take backup
my $backup_name = 'my_backup';
$node_master->backup($backup_name);

# Create streaming standby linking to master
$node_standby = get_new_node('standby');
$node_standby->init_from_backup($node_master, $backup_name,
	has_streaming => 1);
$node_standby->start;

# Create a table with a bloom index on master
$node_master->psql("postgres", "CREATE EXTENSION bloom;");
$node_master->psql("postgres", "CREATE TABLE tst (i int4, t text);");
$node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series(1,100000) i;");
$node_master->psql("postgres", "CREATE INDEX bloomidx ON tst USING bloom (i, t) WITH (col1 = 3);");

# Test that queries give same result
test_index_replay('initial');

# Run 10 cycles of table modification (delete, vacuum, insert). Run test
# queries after each modification.
for my $i (1..10)
{
	$node_master->psql("postgres", "DELETE FROM tst WHERE i = $i;");
	test_index_replay("delete $i");

	$node_master->psql("postgres", "VACUUM tst;");
	test_index_replay("vacuum $i");

	# Each cycle inserts the next batch of 10000 series values
	my $start = 100001 + ($i - 1) * 10000;
	my $end = 100000 + $i * 10000;
	$node_master->psql("postgres", "INSERT INTO tst SELECT i%10, substr(md5(i::text), 1, 1) FROM generate_series($start,$end) i;");
	test_index_replay("insert $i");
}
@ -0,0 +1,218 @@ |
||||
<!-- doc/src/sgml/bloom.sgml --> |
||||
|
||||
<sect1 id="bloom" xreflabel="bloom"> |
||||
<title>bloom</title> |
||||
|
||||
<indexterm zone="bloom"> |
||||
<primary>bloom</primary> |
||||
</indexterm> |
||||
|
||||
<para> |
||||
<literal>bloom</> is a contrib module which implements an index access method. It comes |
||||
as an example of custom access methods and generic WAL record usage. But it |
||||
is also useful in its own right. |
||||
</para> |
||||
|
||||
<sect2> |
||||
<title>Introduction</title> |
||||
|
||||
<para> |
||||
This implementation of a |
||||
<ulink url="http://en.wikipedia.org/wiki/Bloom_filter">Bloom filter</ulink> |
||||
allows fast exclusion of non-candidate tuples. |
||||
Since the signature is a lossy representation of all indexed attributes, |
||||
search results must be rechecked using heap information. |
||||
The user can specify the signature length (in uint16, default is 5) and the number of |
||||
bits that can be set per attribute (1 &lt; colN &lt; 2048). |
||||
</para> |
||||
|
||||
<para> |
||||
This index is useful if a table has many attributes and queries can include |
||||
arbitrary combinations of them. A traditional <literal>btree</> index is faster |
||||
than a bloom index, but it would require too many indexes to support all possible |
||||
queries, while one needs only a single bloom index. A bloom index supports only |
||||
equality comparison. Since it's a signature file, not a tree, it always |
||||
has to be read fully, but sequentially, so index search performance is |
||||
constant and doesn't depend on the query. |
||||
</para> |
||||
</sect2> |
||||
|
||||
<sect2> |
||||
<title>Parameters</title> |
||||
|
||||
<para> |
||||
<literal>bloom</> indexes accept the following parameters in the <literal>WITH</> |
||||
clause. |
||||
</para> |
||||
|
||||
<variablelist> |
||||
<varlistentry> |
||||
<term><literal>length</></term> |
||||
<listitem> |
||||
<para> |
||||
Length of signature in uint16 type values |
||||
</para> |
||||
</listitem> |
||||
</varlistentry> |
||||
</variablelist> |
||||
<variablelist> |
||||
<varlistentry> |
||||
<term><literal>col1 — col16</></term> |
||||
<listitem> |
||||
<para> |
||||
Number of bits for corresponding column |
||||
</para> |
||||
</listitem> |
||||
</varlistentry> |
||||
</variablelist> |
||||
</sect2> |
||||
|
||||
<sect2> |
||||
<title>Examples</title> |
||||
|
||||
<para> |
||||
An example of an index definition is given below. |
||||
</para> |
||||
|
||||
<programlisting> |
||||
CREATE INDEX bloomidx ON tbloom USING bloom (i1,i2,i3) |
||||
WITH (length=5, col1=2, col2=2, col3=4); |
||||
</programlisting> |
||||
|
||||
<para> |
||||
Here, we create a bloom index with a signature length of 80 bits, with attributes |
||||
i1 and i2 mapped to 2 bits each and attribute i3 mapped to 4 bits. |
||||
</para> |
||||
|
||||
<para> |
||||
An example of index definition and usage is given below. |
||||
</para> |
||||
|
||||
<programlisting> |
||||
CREATE TABLE tbloom AS |
||||
SELECT |
||||
random()::int as i1, |
||||
random()::int as i2, |
||||
random()::int as i3, |
||||
random()::int as i4, |
||||
random()::int as i5, |
||||
random()::int as i6, |
||||
random()::int as i7, |
||||
random()::int as i8, |
||||
random()::int as i9, |
||||
random()::int as i10, |
||||
random()::int as i11, |
||||
random()::int as i12, |
||||
random()::int as i13 |
||||
FROM |
||||
generate_series(1,1000); |
||||
CREATE INDEX bloomidx ON tbloom USING |
||||
bloom (i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12); |
||||
SELECT pg_relation_size('bloomidx'); |
||||
CREATE index btree_idx ON tbloom(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12); |
||||
SELECT pg_relation_size('btree_idx'); |
||||
</programlisting> |
||||
|
||||
<programlisting> |
||||
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15; |
||||
QUERY PLAN |
||||
----------------------------------------------------------------------------------------------------------------- |
||||
Bitmap Heap Scan on tbloom (cost=1.50..5.52 rows=1 width=52) (actual time=0.057..0.057 rows=0 loops=1) |
||||
Recheck Cond: ((i2 = 20) AND (i10 = 15)) |
||||
-> Bitmap Index Scan on bloomidx (cost=0.00..1.50 rows=1 width=0) (actual time=0.041..0.041 rows=9 loops=1) |
||||
Index Cond: ((i2 = 20) AND (i10 = 15)) |
||||
Total runtime: 0.081 ms |
||||
(5 rows) |
||||
</programlisting> |
||||
|
||||
<para> |
||||
Seqscan is slow. |
||||
</para> |
||||
|
||||
<programlisting> |
||||
=# SET enable_bitmapscan = off; |
||||
=# SET enable_indexscan = off; |
||||
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15; |
||||
QUERY PLAN |
||||
-------------------------------------------------------------------------------------------------- |
||||
Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.162..0.162 rows=0 loops=1) |
||||
Filter: ((i2 = 20) AND (i10 = 15)) |
||||
Total runtime: 0.181 ms |
||||
(3 rows) |
||||
</programlisting> |
||||
|
||||
<para> |
||||
The btree index will not be used for this query. |
||||
</para> |
||||
|
||||
<programlisting> |
||||
=# DROP INDEX bloomidx; |
||||
=# CREATE INDEX btree_idx ON tbloom(i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12); |
||||
=# EXPLAIN ANALYZE SELECT * FROM tbloom WHERE i2 = 20 AND i10 = 15; |
||||
QUERY PLAN |
||||
-------------------------------------------------------------------------------------------------- |
||||
Seq Scan on tbloom (cost=0.00..25.00 rows=1 width=52) (actual time=0.210..0.210 rows=0 loops=1) |
||||
Filter: ((i2 = 20) AND (i10 = 15)) |
||||
Total runtime: 0.250 ms |
||||
(3 rows) |
||||
</programlisting> |
||||
</sect2> |
||||
|
||||
<sect2> |
||||
<title>Opclass interface</title> |
||||
|
||||
<para> |
||||
The bloom opclass interface is simple. It requires one support function: |
||||
a hash function for the indexed datatype. And it provides one search operator: |
||||
the equality operator. The example below shows an <literal>opclass</> definition |
||||
for the <literal>text</> datatype. |
||||
</para> |
||||
|
||||
<programlisting> |
||||
CREATE OPERATOR CLASS text_ops |
||||
DEFAULT FOR TYPE text USING bloom AS |
||||
OPERATOR 1 =(text, text), |
||||
FUNCTION 1 hashtext(text); |
||||
</programlisting> |
||||
</sect2> |
||||
|
||||
<sect2> |
||||
<title>Limitations</title> |
||||
<para> |
||||
|
||||
<itemizedlist> |
||||
<listitem> |
||||
<para> |
||||
For now, only opclasses for <literal>int4</> and <literal>text</> come |
||||
with the contrib module. However, users may define more of them. |
||||
</para> |
||||
</listitem> |
||||
|
||||
<listitem> |
||||
<para> |
||||
Only the <literal>=</literal> operator is supported for search now. But it's |
||||
possible to add support for arrays with contains and intersection |
||||
operations in the future. |
||||
</para> |
||||
</listitem> |
||||
</itemizedlist> |
||||
</para> |
||||
</sect2> |
||||
|
||||
<sect2> |
||||
<title>Authors</title> |
||||
|
||||
<para> |
||||
Teodor Sigaev <email>teodor@postgrespro.ru</email>, Postgres Professional, Moscow, Russia |
||||
</para> |
||||
|
||||
<para> |
||||
Alexander Korotkov <email>a.korotkov@postgrespro.ru</email>, Postgres Professional, Moscow, Russia |
||||
</para> |
||||
|
||||
<para> |
||||
Oleg Bartunov <email>obartunov@postgrespro.ru</email>, Postgres Professional, Moscow, Russia |
||||
</para> |
||||
</sect2> |
||||
|
||||
</sect1> |
Loading…
Reference in new issue