commit Oleg and Teodor's RD-tree implementation ... this provides the


			
			
				REL7_1_STABLE
			
			
		
Marc G. Fournier 25 years ago
parent 0ad7db4be4
commit 1db943b3ca
  1. 69
      contrib/intarray/Makefile
  2. 64
      contrib/intarray/Makefile.703
  3. 81
      contrib/intarray/README.intarray
  4. 842
      contrib/intarray/_int.c
  5. 211
      contrib/intarray/_int.sql.in
  6. 104
      contrib/intarray/bench/bench.pl
  7. 73
      contrib/intarray/bench/create_test.pl
  8. 5000
      contrib/intarray/data/test__int.data
  9. 19
      contrib/intarray/expected/_int.out
  10. 15
      contrib/intarray/sql/_int.sql

@ -0,0 +1,69 @@
subdir = contrib/intarray
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
# override libdir to install shlib in contrib not main directory
libdir := $(libdir)/contrib
# shared library parameters
NAME= _int
SO_MAJOR_VERSION= 1
SO_MINOR_VERSION= 0
override CPPFLAGS += -I$(srcdir) -DPGSQL71
OBJS= _int.o
all: all-lib $(NAME).sql
# Shared library stuff
include $(top_srcdir)/src/Makefile.shlib
$(NAME).sql: $(NAME).sql.in
sed -e 's:MODULE_PATHNAME:$(libdir)/$(shlib):g' < $< > $@
.PHONY: submake
submake:
$(MAKE) -C $(top_builddir)/src/test/regress pg_regress
# Run the regression test against an already installed and running
# postmaster.  The target used to be defined twice; make kept only the
# last recipe, so the "not supported" stub was dead and only produced an
# "overriding commands" warning.
installcheck: submake
	$(top_builddir)/src/test/regress/pg_regress _int
# in-tree test doesn't work yet (no way to install my shared library)
#check: all submake
# $(top_builddir)/src/test/regress/pg_regress --temp-install \
# --top-builddir=$(top_builddir) _int
check:
@echo "'make check' is not supported."
@echo "Do 'make install', then 'make installcheck' instead."
install: all installdirs install-lib
#$(INSTALL_DATA) $(srcdir)/README.$(NAME) $(docdir)/contrib
$(INSTALL_DATA) $(NAME).sql $(datadir)/contrib
installdirs:
$(mkinstalldirs) $(docdir)/contrib $(datadir)/contrib $(libdir)
uninstall: uninstall-lib
rm -f $(docdir)/contrib/README.$(NAME) $(datadir)/contrib/$(NAME).sql
clean distclean maintainer-clean: clean-lib
rm -f *.so y.tab.c y.tab.h $(OBJS) $(NAME).sql
# things created by various check targets
rm -rf results tmp_check log
rm -f regression.diffs regression.out regress.out run_check.out
ifeq ($(PORTNAME), win)
rm -f regress.def
endif
depend dep:
$(CC) -MM $(CFLAGS) *.c >depend
ifeq (depend,$(wildcard depend))
include depend
endif

@ -0,0 +1,64 @@
#-------------------------------------------------------------------------
#
# Makefile --
#
# Makefile for the _int module (RD-tree GiST index support for int4 arrays)
#
#-------------------------------------------------------------------------
PGDIR = ../..
SRCDIR = $(PGDIR)/src
include $(SRCDIR)/Makefile.global
INCLUDE_OPT = -I ./ \
-I $(SRCDIR)/ \
-I $(SRCDIR)/include \
-I $(SRCDIR)/port/$(PORTNAME)
CFLAGS += $(INCLUDE_OPT) $(CFLAGS_SL)
MODNAME = _int
OBJFILES = $(MODNAME).o
SQLDEFS = $(MODNAME).sql
MODULE = $(MODNAME)$(DLSUFFIX)
MODDIR = $(LIBDIR)/modules
SQLDIR = $(LIBDIR)/sql
all: module sql
module: $(MODULE)
sql: $(SQLDEFS)
$(MODULE): $(OBJFILES)
$(CC) $(CFLAGS) -shared -o $@ $(OBJFILES)
install: $(MODULE) $(SQLDEFS) $(MODDIR) $(SQLDIR)
cp -p $(MODULE) $(MODDIR)/
strip $(MODDIR)/$(MODULE)
cp -p $(SQLDEFS) $(SQLDIR)/
$(MODDIR):
mkdir -p $@
$(SQLDIR):
mkdir -p $@
%.sql: %.sql.in
sed "s|MODULE_PATHNAME|$(MODDIR)/$(MODULE)|" < $< > $@
depend dep:
$(CC) -MM $(INCLUDE_OPT) *.c >depend
clean:
rm -f $(MODULE) $(SQLDEFS) *$(DLSUFFIX)
rm -f *~ *# *.b *.o *.output *.tab.h $(MODNAME)parse.h $(MODNAME)parse.c $(MODNAME)scan.c
ifeq (depend,$(wildcard depend))
include depend
endif

@ -0,0 +1,81 @@
This is an implementation of RD-tree data structure using GiST interface
of PostgreSQL. It has built-in lossy compression, so the index must be declared
lossy at creation time - WITH (islossy). The current implementation has index support
for one-dimensional array of int4's.
All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
(oleg@sai.msu.su). See http://www.sai.msu.su/~megera/postgres/gist
for additional information.
INSTALLATION:
gmake
gmake install
-- load functions
psql <database> < _int.sql
REGRESSION TEST:
gmake installcheck
EXAMPLE USAGE:
create table message (mid int not null,sections int[]);
create table message_section_map (mid int not null,sid int not null);
-- create indices
CREATE unique index message_key on message ( mid );
CREATE unique index message_section_map_key2 on message_section_map (sid, mid );
CREATE INDEX message_rdtree_idx on message using gist ( sections ) with ( islossy );
-- select some messages with section in 1 OR 2 - OVERLAP operator
select message.mid from message where message.sections && '{1,2}';
-- select messages that are in sections 1 AND 2 - CONTAINS operator
select message.mid from message where message.sections @ '{1,2}';
-- the same, CONTAINED operator
select message.mid from message where '{1,2}' ~ message.sections;
BENCHMARK:
subdirectory bench contains benchmark suite.
cd ./bench
1. createdb TEST
2. psql TEST < ../_int.sql
3. ./create_test.pl | psql TEST
4. ./bench.pl - perl script to benchmark queries, supports OR, AND queries
with/without RD-Tree. Run script without arguments to
see available options.
a)test without RD-Tree (OR)
./bench.pl -d TEST -s 1,2 -v
b)test with RD-Tree
./bench.pl -d TEST -s 1,2 -v -r
BENCHMARKS:
Size of table <message>: 200000
Size of table <message_section_map>: 268538
Distribution of messages by sections:
section 0: 73899 messages
section 1: 16298 messages
section 50: 1241 messages
section 99: 705 messages
old - without RD-Tree support,
new - with RD-Tree
+----------+---------------+----------------+
|Search set|OR, time in sec|AND, time in sec|
| +-------+-------+--------+-------+
| | old | new | old | new |
+----------+-------+-------+--------+-------+
| 1| 1.427| 0.215| -| -|
+----------+-------+-------+--------+-------+
| 99| 1.029| 0.018| -| -|
+----------+-------+-------+--------+-------+
| 1,2| 1.829| 0.334| 5.654| 0.042|
+----------+-------+-------+--------+-------+
| 1,2,50,60| 2.057| 0.359| 5.044| 0.007|
+----------+-------+-------+--------+-------+

@ -0,0 +1,842 @@
/******************************************************************************
This file contains routines that can be bound to a Postgres backend and
called by the backend in the process of processing queries. The calling
format for these routines is dictated by Postgres architecture.
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <string.h>
#include "postgres.h"
#include "access/gist.h"
#include "access/itup.h"
#include "access/rtree.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"
/* A compressed key keeps at most MAXNUMRANGE [low, high] pairs. */
#define MAXNUMRANGE 100
#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) <= (b) ? (a) : (b))
/* The argument is parenthesized before negation so abs(x - y) expands correctly. */
#define abs(a) ((a) < (0) ? (-(a)) : (a))
/* Pointer to the int4 payload of an array. */
#define ARRPTR(x) ( (int4 *) ARR_DATA_PTR(x) )
/* Number of elements in an array; the helper's name depends on the server version. */
#ifdef PGSQL71
#define ARRSIZE(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
#else
#define ARRSIZE(x) getNitems( ARR_NDIM(x), ARR_DIMS(x))
#endif
/* Only one-dimensional arrays are supported. */
#define NDIM 1
/* True for a NULL pointer, an array that is not NDIM-dimensional, or an empty array. */
#define ARRISNULL(x) ( (x) ? ( ( ARR_NDIM(x) == NDIM ) ? ( ( ARRSIZE( x ) ) ? 0 : 1 ) : 1 ) : 1 )
/*
 * Sort the elements of x in place.  Wrapped in do { } while (0) so the macro
 * behaves as a single statement, including inside an if/else without braces.
 */
#define SORT(x) \
	do { \
		if ( ARRSIZE( x ) > 1 ) \
			isort( (void*)ARRPTR( x ), ARRSIZE( x ) ); \
	} while (0)
/*
 * Sort x in place and, when isort() reports duplicates, shrink it to its
 * distinct elements.  x may be reassigned, so it must be an lvalue that
 * points to memory the caller is allowed to repalloc.
 */
#define PREPAREARR(x) \
	do { \
		if ( ARRSIZE( x ) > 1 ) { \
			if ( isort( (void*)ARRPTR( x ), ARRSIZE( x ) ) ) \
				x = _int_unique( x ); \
		} \
	} while (0)
/*
#define GIST_DEBUG
#define GIST_QUERY_DEBUG
*/
#ifdef GIST_DEBUG
/*
 * Debug helper: log the first num elements of a (or all of them when the
 * array is shorter) as one NOTICE line.  Output is truncated rather than
 * overflowing the local buffer when the array is long.
 */
static void printarr ( ArrayType * a, int num ) {
	char bbb[16384];
	char *cur = bbb;
	char *end = bbb + sizeof(bbb);
	int l, shown;
	int *d = ARRPTR( a );

	shown = min( num, ARRSIZE( a ) );
	*bbb = '\0';
	for (l = 0; l < shown; l++) {
		int written = snprintf(cur, end - cur, "%d ", d[l] );

		if ( written < 0 || written >= end - cur ) {
			/* no room for this element: drop the partial text and stop */
			*cur = '\0';
			break;
		}
		cur += written;
	}
	elog(NOTICE, "\t\t%s", bbb);
}
#endif
/*
** useful functions
*/
bool isort( int *a, const int len );
ArrayType * new_intArrayType( int num );
ArrayType * copy_intArrayType( ArrayType * a );
ArrayType * resize_intArrayType( ArrayType * a, int num );
int internal_size( int *a, int len );
ArrayType * _int_unique( ArrayType * a );
/*
** GiST support methods
*/
bool g_int_consistent(GISTENTRY *entry, ArrayType *query, StrategyNumber strategy);
GISTENTRY * g_int_compress(GISTENTRY *entry);
GISTENTRY * g_int_decompress(GISTENTRY *entry);
float * g_int_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result);
GIST_SPLITVEC * g_int_picksplit(bytea *entryvec, GIST_SPLITVEC *v);
bool g_int_internal_consistent(ArrayType *key, ArrayType *query, StrategyNumber strategy);
ArrayType * g_int_union(bytea *entryvec, int *sizep);
bool * g_int_same(ArrayType *b1, ArrayType *b2, bool *result);
/*
** R-tree support functions
*/
bool inner_int_contains(ArrayType *a, ArrayType *b);
bool inner_int_overlap(ArrayType *a, ArrayType *b);
ArrayType * inner_int_union(ArrayType *a, ArrayType *b);
ArrayType * inner_int_inter(ArrayType *a, ArrayType *b);
bool _int_different(ArrayType *a, ArrayType *b);
bool _int_same(ArrayType *a, ArrayType *b);
bool _int_contains(ArrayType *a, ArrayType *b);
bool _int_contained(ArrayType *a, ArrayType *b);
bool _int_overlap(ArrayType *a, ArrayType *b);
ArrayType * _int_union(ArrayType *a, ArrayType *b);
ArrayType * _int_inter(ArrayType *a, ArrayType *b);
void rt__int_size(ArrayType *a, float* sz);
/*****************************************************************************
* GiST functions
*****************************************************************************/
/*
** The GiST Consistent method for int4 arrays.
** Should return false if for all data items x below entry,
** the predicate x op query == FALSE, where op is the operator
** corresponding to strategy in the pg_amop table.
**
** Returns FALSE for a NULL, empty or non-1-D query.  The query is sorted
** and de-duplicated on a private copy: PREPAREARR writes into the array and
** may repalloc it, which must not be done to the caller's datum.
*/
bool
g_int_consistent(GISTENTRY *entry,
				 ArrayType *query,
				 StrategyNumber strategy)
{
	ArrayType *q;
	bool retval;

	if ( ARRISNULL( query ) ) return FALSE;

	/* sort query for fast search, key is already sorted */
	q = copy_intArrayType( query );
	PREPAREARR( q );

	/* leaf and internal entries are both checked by the same routine */
	retval = g_int_internal_consistent((ArrayType *)(entry->pred), q, strategy);

	pfree( q );
	return(retval);
}
/*
** The GiST Union method for int4 arrays.
** Returns the minimal set that encloses all the entries in entryvec and
** stores its size in bytes in *sizep.  The result is always a freshly
** allocated array; with no entries at all, *sizep is set to 0 and NULL is
** returned.
*/
ArrayType *
g_int_union(bytea *entryvec, int *sizep)
{
	int numranges, i;
	GISTENTRY *ent = (GISTENTRY *) VARDATA(entryvec);
	ArrayType *out = (ArrayType *)NULL;
	ArrayType *tmp;

	numranges = (VARSIZE(entryvec) - VARHDRSZ)/sizeof(GISTENTRY);

#ifdef GIST_DEBUG
	elog(NOTICE, "union %d", numranges);
#endif

	if ( numranges < 1 ) {
		*sizep = 0;
		return NULL;
	}

	tmp = (ArrayType *) ent[0].pred;
	for (i = 1; i < numranges; i++) {
		out = inner_int_union(tmp, (ArrayType *) ent[i].pred);
		/* tmp is an intermediate result from the second iteration on */
		if (i > 1 && tmp) pfree(tmp);
		tmp = out;
	}

	/*
	 * With a single entry the loop above never runs; the union is then a
	 * copy of that entry (or an empty array when the entry is null).
	 */
	if ( out == NULL )
		out = inner_int_union(tmp, (ArrayType *) NULL);

	*sizep = VARSIZE( out );
#ifdef GIST_DEBUG
	elog(NOTICE, "\t ENDunion %d %d", *sizep, ARRSIZE( out ) );
#endif
	if ( *sizep == 0 ) {
		pfree( out );
		return NULL;
	}
	return(out);
}
/*
** GiST Compress and Decompress methods
*/
/*
** g_int_compress: build the stored form of a key.
**
** A NULL, empty or non-1-D key yields an entry with a NULL key and 0 bytes.
** Otherwise the key is copied (leaf keys are also sorted and de-duplicated).
** Keys with at least 2*MAXNUMRANGE elements are replaced by a list of
** [low, high] ranges: every element starts as the range [v, v], and the two
** neighbouring ranges with the smallest gap are merged until no more than
** MAXNUMRANGE ranges (2*MAXNUMRANGE ints) remain.  Shorter keys are stored
** as they are.
**
** Always returns a newly allocated GISTENTRY.
*/
GISTENTRY *
g_int_compress(GISTENTRY *entry)
{
GISTENTRY *retval;
ArrayType * r;
int len;
int *dr;
int i,min,cand;
retval = palloc(sizeof(GISTENTRY));
if ( ! retval )
elog(ERROR,"Can't allocate memory for compression");
/* nothing to index: emit an entry without key data */
if ( ARRISNULL( (ArrayType *) entry->pred ) ) {
#ifdef GIST_DEBUG
elog(NOTICE,"COMP IN: NULL");
#endif
gistentryinit(*retval, (char *)NULL, entry->rel, entry->page, entry->offset,
0, FALSE);
return( retval );
}
/* work on a private copy so the incoming key is left untouched */
r = copy_intArrayType( (ArrayType *) entry->pred );
if ( entry->leafkey ) PREPAREARR( r );
len = ARRSIZE( r );
#ifdef GIST_DEBUG
elog(NOTICE, "COMP IN: %d leaf; %d rel; %d page; %d offset; %d bytes; %d elems", entry->leafkey, (int)entry->rel, (int)entry->page, (int)entry->offset, (int)entry->bytes, len);
//printarr( r, len );
#endif
if ( len >= 2*MAXNUMRANGE ) { /*compress*/
/*
 * Double the array and turn element i into the pair (dr[2i], dr[2i+1]).
 * The loop runs from the end so the in-place expansion never overwrites
 * an element that has not been read yet.
 */
r = resize_intArrayType( r, 2*( len ) );
dr = ARRPTR( r );
for(i=len-1; i>=0;i--)
dr[2*i] = dr[2*i+1] = dr[i];
len *= 2;
cand = 1;
/* each pass removes one range boundary, i.e. two ints */
while( len > MAXNUMRANGE * 2 ) {
min = 0x7fffffff;
/* dr[i] starts a range and dr[i-1] ends the previous one: find the smallest gap */
for( i=2; i<len;i+=2 )
if ( min > (dr[i] - dr[i-1]) ) {
min = (dr[i] - dr[i-1]);
cand = i;
}
/* drop dr[cand-1] and dr[cand], joining the two ranges around the gap */
memmove( (void*)&dr[cand-1], (void*)&dr[cand+1], (len - cand - 1)*sizeof(int) );
len -= 2;
}
/* shrink the allocation to the ints that are still in use */
r = resize_intArrayType(r, len );
}
gistentryinit(*retval, (char *)r, entry->rel, entry->page, entry->offset, VARSIZE( r ), FALSE);
return(retval);
}
/*
** g_int_decompress: turn a stored key back into a plain sorted int array.
**
** - A key too short to hold an array header, or one that is NULL, empty or
**   not 1-D, yields a new entry with a NULL key.
** - A key with fewer than 2*MAXNUMRANGE elements is treated as uncompressed:
**   the incoming entry is re-initialized in place and returned.
** - Any other key is read as a list of [low, high] pairs and expanded into a
**   new array that enumerates every value of every range.
*/
GISTENTRY *
g_int_decompress(GISTENTRY *entry)
{
GISTENTRY *retval;
ArrayType * r;
int *dr, lenr;
ArrayType * in;
int lenin;
int *din;
int i,j;
if ( entry->bytes < ARR_OVERHEAD( NDIM ) || ARRISNULL( (ArrayType *) entry->pred ) ) {
retval = palloc(sizeof(GISTENTRY));
if ( ! retval )
elog(ERROR,"Can't allocate memory for decompression");
gistentryinit(*retval, (char *)NULL, entry->rel, entry->page, entry->offset, 0, FALSE);
#ifdef GIST_DEBUG
elog(NOTICE,"DECOMP IN: NULL");
#endif
return( retval );
}
in = (ArrayType *) entry->pred;
lenin = ARRSIZE(in);
din = ARRPTR(in);
if ( lenin < 2*MAXNUMRANGE ) { /* not a compressed value */
/* sometimes strange bytesize */
/* reset the entry's byte count from the array's own size and reuse the entry */
gistentryinit(*entry, (char *)in, entry->rel, entry->page, entry->offset, VARSIZE( in ), FALSE);
return (entry);
}
#ifdef GIST_DEBUG
elog(NOTICE, "DECOMP IN: %d leaf; %d rel; %d page; %d offset; %d bytes; %d elems", entry->leafkey, (int)entry->rel, (int)entry->page, (int)entry->offset, (int)entry->bytes, lenin);
//printarr( in, lenin );
#endif
/* size the output from the ranges, then enumerate them */
lenr = internal_size(din, lenin);
r = new_intArrayType( lenr );
dr = ARRPTR( r );
/*
 * din[i] .. din[i+1] is one inclusive range.  A value equal to the one
 * written last is skipped, so a range that starts where the previous one
 * ended does not produce a duplicate.
 */
for(i=0;i<lenin;i+=2)
for(j=din[i]; j<=din[i+1]; j++)
if ( (!i) || *(dr-1) != j )
*dr++ = j;
retval = palloc(sizeof(GISTENTRY));
if ( ! retval )
elog(ERROR,"Can't allocate memory for decompression");
gistentryinit(*retval, (char *)r, entry->rel, entry->page, entry->offset, VARSIZE( r ), FALSE);
return(retval);
}
/*
** The GiST Penalty method for int4 arrays.
** The penalty of adding newentry to origentry is the number of elements by
** which the union of both keys exceeds the original key.  The value is
** stored in *result, and result itself is returned.
*/
float *
g_int_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
{
	ArrayType *orig_key = (ArrayType *)(origentry->pred);
	ArrayType *new_key = (ArrayType *)(newentry->pred);
	ArrayType *merged;
	float merged_size;
	float orig_size;

#ifdef GIST_DEBUG
	elog(NOTICE, "penalty");
#endif

	merged = inner_int_union(orig_key, new_key);
	rt__int_size(merged, &merged_size);
	rt__int_size(orig_key, &orig_size);
	*result = merged_size - orig_size;
	/* the union was only needed for its size */
	pfree((char *)merged);

#ifdef GIST_DEBUG
	elog(NOTICE, "--penalty\t%g", *result);
#endif

	return(result);
}
/*
** The GiST PickSplit method for int4 arrays
** We use Guttman's poly time split algorithm
**
** Two seed entries are chosen as the pair whose union exceeds its
** intersection by the largest number of elements.  Every other entry is then
** assigned to the side whose running union grows the least.  The offsets of
** both sides are returned in v->spl_left / v->spl_right, the counts in
** v->spl_nleft / v->spl_nright and the two unions in v->spl_ldatum /
** v->spl_rdatum.
*/
GIST_SPLITVEC *
g_int_picksplit(bytea *entryvec,
GIST_SPLITVEC *v)
{
OffsetNumber i, j;
ArrayType *datum_alpha, *datum_beta;
ArrayType *datum_l, *datum_r;
ArrayType *union_d, *union_dl, *union_dr;
ArrayType *inter_d;
bool firsttime;
float size_alpha, size_beta, size_union, size_inter;
float size_waste, waste;
float size_l, size_r;
int nbytes;
OffsetNumber seed_1 = 0, seed_2 = 0;
OffsetNumber *left, *right;
OffsetNumber maxoff;
#ifdef GIST_DEBUG
elog(NOTICE, "--------picksplit %d",(VARSIZE(entryvec) - VARHDRSZ)/sizeof(GISTENTRY));
#endif
/* the seed search below covers offsets FirstOffsetNumber .. (entry count - 2) */
maxoff = ((VARSIZE(entryvec) - VARHDRSZ)/sizeof(GISTENTRY)) - 2;
nbytes = (maxoff + 2) * sizeof(OffsetNumber);
v->spl_left = (OffsetNumber *) palloc(nbytes);
v->spl_right = (OffsetNumber *) palloc(nbytes);
firsttime = true;
waste = 0.0;
/* examine every pair (i, j) with i < j and keep the most wasteful one as seeds */
for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i)) {
datum_alpha = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[i].pred);
for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j)) {
datum_beta = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[j].pred);
/* compute the wasted space by unioning these guys */
/* size_waste = size_union - size_inter; */
union_d = (ArrayType *)inner_int_union(datum_alpha, datum_beta);
rt__int_size(union_d, &size_union);
inter_d = (ArrayType *)inner_int_inter(datum_alpha, datum_beta);
rt__int_size(inter_d, &size_inter);
size_waste = size_union - size_inter;
pfree(union_d);
/* an empty intersection is returned as NULL */
if (inter_d != (ArrayType *) NULL)
pfree(inter_d);
/*
* are these a more promising split that what we've
* already seen?
*/
if (size_waste > waste || firsttime) {
waste = size_waste;
seed_1 = i;
seed_2 = j;
firsttime = false;
}
}
}
left = v->spl_left;
v->spl_nleft = 0;
right = v->spl_right;
v->spl_nright = 0;
/* start each side's running union as a copy of its seed */
datum_alpha = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[seed_1].pred);
datum_l = copy_intArrayType( datum_alpha );
rt__int_size((ArrayType *)datum_l, &size_l);
datum_beta = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[seed_2].pred);
datum_r = copy_intArrayType( datum_beta );
rt__int_size((ArrayType *)datum_r, &size_r);
/*
* Now split up the regions between the two seeds. An important
* property of this split algorithm is that the split vector v
* has the indices of items to be split in order in its left and
* right vectors. We exploit this property by doing a merge in
* the code that actually splits the page.
*
* For efficiency, we also place the new index tuple in this loop.
* This is handled at the very end, when we have placed all the
* existing tuples and i == maxoff + 1.
*/
maxoff = OffsetNumberNext(maxoff);
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
/*
* If we've already decided where to place this item, just
* put it on the right list. Otherwise, we need to figure
* out which page needs the least enlargement in order to
* store the item.
*/
if (i == seed_1) {
*left++ = i;
v->spl_nleft++;
continue;
} else if (i == seed_2) {
*right++ = i;
v->spl_nright++;
continue;
}
/* okay, which page needs least enlargement? */
datum_alpha = (ArrayType *)(((GISTENTRY *)(VARDATA(entryvec)))[i].pred);
union_dl = (ArrayType *)inner_int_union(datum_l, datum_alpha);
union_dr = (ArrayType *)inner_int_union(datum_r, datum_alpha);
rt__int_size((ArrayType *)union_dl, &size_alpha);
rt__int_size((ArrayType *)union_dr, &size_beta);
/* pick which page to add it to */
/* the chosen side adopts its new union; the old union and the unused candidate are freed */
if (size_alpha - size_l < size_beta - size_r) {
if ( datum_l ) pfree(datum_l);
if ( union_dr ) pfree(union_dr);
datum_l = union_dl;
size_l = size_alpha;
*left++ = i;
v->spl_nleft++;
} else {
if ( datum_r ) pfree(datum_r);
if ( union_dl ) pfree(union_dl);
datum_r = union_dr;
size_r = size_beta;
*right++ = i;
v->spl_nright++;
}
}
/**left = *right = FirstOffsetNumber;*/ /* sentinel value, see dosplit() */
/*
 * The list that received the highest offset has its last slot overwritten
 * with InvalidOffsetNumber; the other list gets FirstOffsetNumber appended.
 * NOTE(review): spl_nleft / spl_nright are not adjusted here; the contract
 * with the caller is not visible in this file -- confirm against the GiST
 * split code.
 */
if ( *(left-1) > *(right-1) ) {
*right = FirstOffsetNumber;
*(left-1) = InvalidOffsetNumber;
} else {
*left = FirstOffsetNumber;
*(right-1) = InvalidOffsetNumber;
}
v->spl_ldatum = (char *)datum_l;
v->spl_rdatum = (char *)datum_r;
#ifdef GIST_DEBUG
elog(NOTICE, "--------ENDpicksplit %d %d",v->spl_nleft, v->spl_nright);
#endif
return v;
}
/*
** The GiST Equality method for int4 arrays.
** Stores TRUE in *result when both keys hold the same elements (as decided
** by _int_same), FALSE otherwise, and returns result.
*/
bool *
g_int_same(ArrayType *b1, ArrayType *b2, bool *result)
{
	*result = _int_same(b1, b2) ? TRUE : FALSE;
	return(result);
}
/*
** Decide whether an index key can satisfy the query under the given
** strategy.  "Overlap" and "contained by" are answered with an overlap
** test, "same" and "contains" with a containment test of the query in the
** key; any other strategy number yields FALSE.
*/
bool
g_int_internal_consistent(ArrayType *key,
						  ArrayType *query,
						  StrategyNumber strategy)
{
#ifdef GIST_QUERY_DEBUG
	elog(NOTICE, "internal_consistent, %d", strategy);
#endif

	if ( strategy == RTOverlapStrategyNumber ||
		 strategy == RTContainedByStrategyNumber )
		return (bool)inner_int_overlap(key, query);

	if ( strategy == RTSameStrategyNumber ||
		 strategy == RTContainsStrategyNumber )
		return (bool)inner_int_contains(key, query);

	return FALSE;
}
/* _int_contained -- is a contained in b?  Same as "b contains a". */
bool
_int_contained(ArrayType *a, ArrayType *b)
{
	bool b_contains_a = _int_contains(b, a);

	return b_contains_a;
}
/*
 * _int_contains -- does a contain every element of b?
 * Returns FALSE when either argument is NULL, empty or not 1-D.  Both
 * arguments are copied before being sorted and de-duplicated, so the
 * caller's arrays are not modified.
 */
bool
_int_contains ( ArrayType *a, ArrayType *b ) {
	ArrayType *sorted_a;
	ArrayType *sorted_b;
	bool contained;

	if ( ARRISNULL( a ) ) return FALSE;
	if ( ARRISNULL( b ) ) return FALSE;

	sorted_a = copy_intArrayType( a );
	sorted_b = copy_intArrayType( b );
	PREPAREARR(sorted_a);
	PREPAREARR(sorted_b);

	contained = inner_int_contains( sorted_a, sorted_b );

	pfree( sorted_a );
	pfree( sorted_b );
	return contained;
}
/*
 * inner_int_contains -- containment test used on prepared arrays.
 * Walks both arrays in step and returns TRUE only when every element of b
 * is matched by an element of a.  Returns FALSE when either argument is
 * NULL, empty or not 1-D.
 */
bool
inner_int_contains ( ArrayType *a, ArrayType *b ) {
	int na, nb;
	int *pa, *pb;
	int *end_a, *end_b;

	if ( ARRISNULL( a ) || ARRISNULL( b ) ) return FALSE;

	na = ARRSIZE( a );
	nb = ARRSIZE( b );
	pa = ARRPTR( a );
	pb = ARRPTR( b );
	end_a = pa + na;
	end_b = pb + nb;

#ifdef GIST_DEBUG
	elog(NOTICE, "contains %d %d", na, nb);
#endif

	while ( pa < end_a && pb < end_b ) {
		if ( *pa < *pb )
			pa++;
		else if ( *pa == *pb ) {
			pa++;
			pb++;
		} else
			return FALSE;	/* *pb was stepped over unmatched, so b cannot be fully matched */
	}

	/* b is contained exactly when all of its elements were consumed by matches */
	return ( pb == end_b ) ? TRUE : FALSE;
}
/*****************************************************************************
* Operator class for R-tree indexing
*****************************************************************************/
/* _int_different -- logical negation of _int_same. */
bool
_int_different(ArrayType *a, ArrayType *b)
{
	return _int_same( a, b ) ? FALSE : TRUE;
}
/*
 * _int_same -- do a and b hold the same elements, ignoring order?
 *
 * Two "null" arguments (NULL pointer, empty or not 1-D) are the same; a
 * null and a non-null argument are not.  Arrays of different length are
 * never the same.  Otherwise sorted copies are compared element by element.
 *
 * The comparison works on copies, like _int_contains does, so the caller's
 * arrays are never reordered.
 */
bool
_int_same ( ArrayType *a, ArrayType *b ) {
	ArrayType *an, *bn;
	int na , nb ;
	int n;
	int *da, *db;
	bool res = TRUE;
	bool anull = ARRISNULL( a );
	bool bnull = ARRISNULL( b );

	if ( anull || bnull )
		return ( anull && bnull ) ? TRUE : FALSE;

	/* the length check needs no sorting, so do it before copying */
	na = ARRSIZE( a );
	nb = ARRSIZE( b );
	if ( na != nb ) return FALSE;

	an = copy_intArrayType( a );
	bn = copy_intArrayType( b );
	SORT( an );
	SORT( bn );
	da = ARRPTR( an );
	db = ARRPTR( bn );

	for(n=0; n<na; n++)
		if ( da[n] != db[n] ) {
			res = FALSE;
			break;
		}

	pfree( an );
	pfree( bn );
	return res;
}
/*
 * _int_overlap -- do a and b share at least one element?
 * Returns FALSE when either argument is NULL, empty or not 1-D.
 * The arrays are sorted on private copies, like _int_contains does, so the
 * caller's arrays are never reordered.
 */
bool
_int_overlap ( ArrayType *a, ArrayType *b ) {
	ArrayType *an, *bn;
	bool res;

	if ( ARRISNULL( a ) || ARRISNULL( b ) ) return FALSE;

	an = copy_intArrayType( a );
	bn = copy_intArrayType( b );
	SORT(an);
	SORT(bn);

	res = inner_int_overlap( an, bn );

	pfree( an );
	pfree( bn );
	return res;
}
/*
 * inner_int_overlap -- overlap test used on sorted arrays.
 * Advances through both arrays in step and returns TRUE at the first pair
 * of equal elements.  Returns FALSE when either argument is NULL, empty or
 * not 1-D, or when one array is exhausted without a match.
 */
bool
inner_int_overlap ( ArrayType *a, ArrayType *b ) {
	int *pa, *pb;
	int *end_a, *end_b;

	if ( ARRISNULL( a ) || ARRISNULL( b ) ) return FALSE;

	pa = ARRPTR( a );
	pb = ARRPTR( b );
	end_a = pa + ARRSIZE( a );
	end_b = pb + ARRSIZE( b );

#ifdef GIST_DEBUG
	elog(NOTICE, "g_int_overlap");
#endif

	while ( pa < end_a && pb < end_b ) {
		if ( *pa == *pb )
			return TRUE;
		if ( *pa < *pb )
			pa++;
		else
			pb++;
	}
	return FALSE;
}
/*
 * _int_union -- set union of a and b as a new sorted array without
 * duplicates.  A NULL, empty or non-1-D argument contributes nothing; when
 * both are such, an empty array is returned.
 * Usable arguments are sorted on private copies, like _int_contains does,
 * so the caller's arrays are never reordered.
 */
ArrayType *
_int_union ( ArrayType *a, ArrayType *b ) {
	ArrayType *an = NULL;
	ArrayType *bn = NULL;
	ArrayType *res;

	if ( ! ARRISNULL( a ) ) {
		an = copy_intArrayType( a );
		SORT(an);
	}
	if ( ! ARRISNULL( b ) ) {
		bn = copy_intArrayType( b );
		SORT(bn);
	}

	/* inner_int_union always returns a newly allocated array */
	res = inner_int_union( an, bn );

	if ( an ) pfree( an );
	if ( bn ) pfree( bn );
	return res;
}
/*
 * inner_int_union -- union of two sorted arrays.
 * Returns a newly allocated array in every case: an empty array when both
 * arguments are NULL/empty/non-1-D, a copy of the usable argument when only
 * one is, and otherwise the merge of both.  Results with more than one
 * element are passed through _int_unique.
 */
ArrayType *
inner_int_union ( ArrayType *a, ArrayType *b ) {
	ArrayType *merged;
	int a_null = ARRISNULL( a );
	int b_null = ARRISNULL( b );

#ifdef GIST_DEBUG
	//elog(NOTICE, "inner_union %d %d", ARRISNULL( a ) , ARRISNULL( b ) );
#endif

	if ( a_null && b_null )
		return new_intArrayType(0);

	if ( a_null )
		merged = copy_intArrayType( b );
	else if ( b_null )
		merged = copy_intArrayType( a );
	else {
		int na = ARRSIZE( a );
		int nb = ARRSIZE( b );
		int *da = ARRPTR( a );
		int *db = ARRPTR( b );
		int *out;
		int total = na + nb;
		int i = 0, j = 0, k;

		merged = new_intArrayType( total );
		out = ARRPTR( merged );

		/* classic two-way merge: take from a only while its head is strictly smaller */
		for (k = 0; k < total; k++) {
			if ( j >= nb || ( i < na && da[i] < db[j] ) )
				out[k] = da[i++];
			else
				out[k] = db[j++];
		}
	}

	if ( ARRSIZE(merged) > 1 )
		merged = _int_unique( merged );

	return merged;
}
/*
 * _int_inter -- set intersection of a and b as a new sorted array.
 * Returns NULL when either argument is NULL, empty or not 1-D, or when the
 * arrays have no element in common.
 * The arrays are sorted on private copies, like _int_contains does, so the
 * caller's arrays are never reordered.
 */
ArrayType *
_int_inter ( ArrayType *a, ArrayType *b ) {
	ArrayType *an, *bn;
	ArrayType *res;

	/* a pointer-returning function must report "nothing" as NULL, not FALSE */
	if ( ARRISNULL( a ) || ARRISNULL( b ) ) return NULL;

	an = copy_intArrayType( a );
	bn = copy_intArrayType( b );
	SORT(an);
	SORT(bn);

	/* inner_int_inter returns a newly allocated array or NULL */
	res = inner_int_inter( an, bn );

	pfree( an );
	pfree( bn );
	return res;
}
/*
 * inner_int_inter -- intersection of two sorted arrays.
 * Returns a newly allocated array holding each common value once, or NULL
 * when either argument is NULL/empty/non-1-D or nothing is in common.
 */
ArrayType *
inner_int_inter ( ArrayType *a, ArrayType *b ) {
	ArrayType * r;
	int na , nb ;
	int *da, *db, *dr;
	int i,j;

#ifdef GIST_DEBUG
	//elog(NOTICE, "inner_inter %d %d", ARRISNULL( a ), ARRISNULL( b ) );
#endif

	if ( ARRISNULL( a ) || ARRISNULL( b ) ) return NULL;

	na = ARRSIZE( a );
	nb = ARRSIZE( b );
	da = ARRPTR( a );
	db = ARRPTR( b );
	/* the intersection cannot be longer than the shorter input */
	r = new_intArrayType( min(na, nb) );
	dr = ARRPTR( r );

	i = j = 0;
	while( i<na && j<nb )
		if ( da[i] < db[j] )
			i++;
		else if ( da[i] == db[j] ) {
			/*
			 * Compare with the previously stored value only when one
			 * exists.  Looking at dr[-1] while the result is still empty
			 * reads the array header instead of data, which made the
			 * first common value get dropped whenever it happened to
			 * equal that header word.
			 */
			if ( dr == ARRPTR(r) || *(dr-1) != db[j] )
				*dr++ = db[j];
			i++; j++;
		} else
			j++;

	if ( (dr - ARRPTR(r)) == 0 ) {
		pfree( r );
		return NULL;
	} else
		return resize_intArrayType(r, dr - ARRPTR(r) );
}
/*
 * rt__int_size -- store the number of elements of a in *size.
 * A NULL, empty or non-1-D array has size 0.
 */
void
rt__int_size(ArrayType *a, float *size)
{
	if ( ! ARRISNULL( a ) ) {
		*size = (float)ARRSIZE( a );
		return;
	}
	*size = 0.0;
}
/*****************************************************************************
* Miscellaneous operators and functions
*****************************************************************************/
/* qsort comparator: ascending order of ints, written so it cannot overflow. */
static int
isort_cmp(const void *x, const void *y)
{
	int l = *(const int *) x;
	int r = *(const int *) y;

	return (l > r) - (l < r);
}

/*
 * isort -- sort a[0 .. len-1] in ascending order, in place.
 * Returns true when the array contains at least one repeated value, false
 * otherwise (including for len < 2, where nothing is done).
 *
 * Uses the C library's qsort instead of a hand-written exchange sort, whose
 * quadratic running time hurts on long arrays.
 */
bool isort ( int *a, int len ) {
	int i;

	if ( len < 2 )
		return false;

	qsort( a, len, sizeof(int), isort_cmp );

	/* after sorting, repeated values are adjacent */
	for (i = 1; i < len; i++)
		if ( a[i-1] == a[i] )
			return true;

	return false;
}
ArrayType * new_intArrayType( int num ) {
ArrayType * r;
int nbytes = ARR_OVERHEAD( NDIM ) + sizeof(int)*num;
r = (ArrayType *) palloc( nbytes );
if ( ! r )
elog(ERROR, "Can't allocate memory for new array");
MemSet(r, 0, nbytes);
r->size = nbytes;
r->ndim = NDIM;
#ifndef PGSQL71
SET_LO_FLAG(false, r);
#endif
*( (int*)ARR_DIMS(r) ) = num;
*( (int*)ARR_LBOUND(r) ) = 1;
return r;
}
/*
 * resize_intArrayType -- change the element count of a to num.
 * Returns a unchanged when it already has num elements; otherwise the array
 * is reallocated, its size and length are updated, and the (possibly moved)
 * pointer is returned.
 */
ArrayType * resize_intArrayType( ArrayType * a, int num ) {
	int nbytes;

	if ( ARRSIZE(a) == num )
		return a;

	nbytes = ARR_OVERHEAD( NDIM ) + sizeof(int)*num;
	a = (ArrayType *) repalloc( a, nbytes );
	if ( a == NULL )
		elog(ERROR, "Can't reallocate memory for new array");

	a->size = nbytes;
	((int *) ARR_DIMS(a))[0] = num;
	return a;
}
/*
 * copy_intArrayType -- return a newly allocated byte-for-byte copy of a,
 * or NULL when a is NULL.
 *
 * The copy is sized from the source's own VARSIZE.  Sizing it as a
 * one-dimensional array and then copying VARSIZE(a) bytes would write past
 * the allocation for a source with a larger header (more dimensions).
 */
ArrayType * copy_intArrayType( ArrayType * a ) {
	ArrayType * r;

	if ( ! a ) return NULL;

	r = (ArrayType *) palloc( VARSIZE(a) );
	if ( ! r )
		elog(ERROR, "Can't allocate memory for new array");
	memmove(r,a,VARSIZE(a));
	return r;
}
/*
 * internal_size -- number of distinct values produced by expanding a
 * compressed key.
 *
 * a holds len ints read as consecutive inclusive ranges
 * [a[0], a[1]], [a[2], a[3]], ...  Each range contributes high - low + 1
 * values.  When a range starts at the value the previous range ended on,
 * that shared value is counted once only; this matches the expansion loop
 * in g_int_decompress, which skips a value equal to the one written last.
 */
int internal_size (int *a, int len ) {
	int i,size=0;

	for(i=0;i<len;i+=2) {
		size += a[i+1] - a[i] + 1;
		if ( i && a[i] == a[i-1] )	/* first value already counted */
			size--;
	}
	return size;
}
/*
 * _int_unique -- drop repeated values from r in place and shrink it.
 * Expects r to be sorted and to hold more than one element.  Returns the
 * resized array, which may live at a different address.
 */
ArrayType * _int_unique( ArrayType * r ) {
	int *data = ARRPTR( r );
	int num = ARRSIZE(r);
	int last = 0;	/* index of the last value kept so far */
	int i;

	for (i = 0; i < num; i++)
		if ( data[i] != data[last] )
			data[++last] = data[i];

	return resize_intArrayType(r, last + 1 );
}

@ -0,0 +1,211 @@
-- Create the functions, operators and GiST operator class for one-dimensional integer arrays (_int4)
--
BEGIN TRANSACTION;
--
-- External C-functions for R-tree methods
--
-- Comparison methods
CREATE FUNCTION _int_contains(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'contains'::text
FROM pg_proc
WHERE proname = '_int_contains'::name;
CREATE FUNCTION _int_contained(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'contained in'::text
FROM pg_proc
WHERE proname = '_int_contained'::name;
CREATE FUNCTION _int_overlap(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'overlaps'::text
FROM pg_proc
WHERE proname = '_int_overlap'::name;
CREATE FUNCTION _int_same(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'same as'::text
FROM pg_proc
WHERE proname = '_int_same'::name;
CREATE FUNCTION _int_different(_int4, _int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
INSERT INTO pg_description (objoid, description)
SELECT oid, 'different'::text
FROM pg_proc
WHERE proname = '_int_different'::name;
-- support routines for indexing
CREATE FUNCTION _int_union(_int4, _int4) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION _int_inter(_int4, _int4) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
--
-- OPERATORS
--
CREATE OPERATOR && (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_overlap,
COMMUTATOR = '&&',
RESTRICT = contsel, JOIN = contjoinsel
);
--CREATE OPERATOR = (
-- LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_same,
-- COMMUTATOR = '=', NEGATOR = '<>',
-- RESTRICT = eqsel, JOIN = eqjoinsel,
-- SORT1 = '<', SORT2 = '<'
--);
CREATE OPERATOR <> (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_different,
COMMUTATOR = '<>', NEGATOR = '=',
RESTRICT = neqsel, JOIN = neqjoinsel
);
CREATE OPERATOR @ (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_contains,
COMMUTATOR = '~', RESTRICT = contsel, JOIN = contjoinsel
);
CREATE OPERATOR ~ (
LEFTARG = _int4, RIGHTARG = _int4, PROCEDURE = _int_contained,
COMMUTATOR = '@', RESTRICT = contsel, JOIN = contjoinsel
);
-- define the GiST support methods
CREATE FUNCTION g_int_consistent(opaque,_int4,int4) RETURNS bool
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_compress(opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_decompress(opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_penalty(opaque,opaque,opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_picksplit(opaque, opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_union(bytea, opaque) RETURNS _int4
AS 'MODULE_PATHNAME' LANGUAGE 'c';
CREATE FUNCTION g_int_same(_int4, _int4, opaque) RETURNS opaque
AS 'MODULE_PATHNAME' LANGUAGE 'c';
-- register the default opclass for indexing
INSERT INTO pg_opclass (opcname, opcdeftype)
SELECT 'gist__int_ops', oid
FROM pg_type
WHERE typname = '_int4';
-- get the operators whose arguments are both _int4 and store them in a tmp table
SELECT o.oid AS opoid, o.oprname
INTO TABLE _int_ops_tmp
FROM pg_operator o, pg_type t
WHERE o.oprleft = t.oid and o.oprright = t.oid
and t.typname = '_int4';
-- make sure we have the right operators
-- SELECT * from _int_ops_tmp;
-- using the tmp table, generate the amop entries
-- _int_overlap
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 3
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '&&';
-- _int_same
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 6
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '=';
-- _int_contains
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 7
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '@';
-- _int_contained
INSERT INTO pg_amop (amopid, amopclaid, amopopr, amopstrategy)
SELECT am.oid, opcl.oid, c.opoid, 8
FROM pg_am am, pg_opclass opcl, _int_ops_tmp c
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and c.oprname = '~';
DROP TABLE _int_ops_tmp;
-- add the entries to amproc for the support methods
-- note the amprocnum numbers associated with each are specific!
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 1
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_consistent';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 2
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_union';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 3
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_compress';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 4
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_decompress';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 5
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_penalty';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 6
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_picksplit';
INSERT INTO pg_amproc (amid, amopclaid, amproc, amprocnum)
SELECT am.oid, opcl.oid, pro.oid, 7
FROM pg_am am, pg_opclass opcl, pg_proc pro
WHERE amname = 'gist' and opcname = 'gist__int_ops'
and proname = 'g_int_same';
END TRANSACTION;

@ -0,0 +1,104 @@
#!/usr/bin/perl
use strict;
# make sure we are in a sane environment.
use DBI();
use DBD::Pg();
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );
use Getopt::Std;
my %opt;
getopts('d:b:s:veorauc', \%opt);
if ( !( scalar %opt && defined $opt{s} ) ) {
print <<EOT;
Usage:
$0 -d DATABASE -s SECTIONS [-b NUMBER] [-v] [-e] [-o] [-r] [-a] [-u]
-d DATABASE -DATABASE
-b NUMBER -number of repeats
-s SECTIONS -sections, format sid1[,sid2[,sid3[...]]]]
-v -verbose (show SQL)
-e -show explain
-r -use RD-tree index
-a -AND section
-o -show output
-u -unique
-c -count
EOT
exit;
}
$opt{d} ||= '_int4';

# DBI->connect returns undef on failure; stop here with the driver's message
# instead of failing later on a method call against an undefined handle.
my $dbi = DBI->connect('DBI:Pg:dbname='.$opt{d})
    or die "Cannot connect to database '$opt{d}': $DBI::errstr\n";

# Tables to list in FROM and conditions to AND together in WHERE.
my %table;
my @where;

$table{message}=1;

if ( $opt{a} ) {
    # AND query: the message must be in every requested section.
    if ( $opt{r} ) {
        push @where, "message.sections @ '{$opt{s}}'";
    } else {
        foreach my $sid ( split(/[,\s]+/, $opt{s} )) {
            push @where, "EXISTS ( select message_section_map.mid from message_section_map where message.mid=message_section_map.mid and message_section_map.sid = $sid )";
        }
    }
} else {
    # OR query: the message must be in at least one requested section.
    if ( $opt{r} ) {
        push @where, "message.sections && '{$opt{s}}'";
    } else {
        $table{message_section_map} = 1;
        push @where, "message.mid = message_section_map.mid";
        push @where, "message_section_map.sid in ($opt{s})";
    }
}

# Output column: a count (-c) or the message ids, optionally distinct (-u).
my $outf;
if ( $opt{c} ) {
    $outf = ( $opt{u} ) ? 'count( distinct message.mid )' : 'count( message.mid )';
} else {
    $outf = ( $opt{u} ) ? 'distinct( message.mid )' : 'message.mid';
}
my $sql = "select $outf from ".join(', ', keys %table)." where ".join(' AND ', @where).';';

if ( $opt{v} ) {
    print "$sql\n";
}

if ( $opt{e} ) {
    $dbi->do("explain $sql");
}

# Run the query -b times (once by default) and time the whole run.
my $t0 = [gettimeofday];
my $b = $opt{b};
$b ||= 1;
my @a;
foreach ( 1..$b ) {
    @a = exec_sql($dbi, $sql);
}
my $found = scalar @a;    # rows returned by the last run
my $elapsed = tv_interval ( $t0, [gettimeofday]);

if ( $opt{o} ) {
    # NOTE(review): the query built above never selects "sections", so the
    # second column is always empty -- confirm what -o is meant to show.
    foreach ( @a ) {
        print "$_->{mid}\t$_->{sections}\n";
    }
}
print sprintf("total: %.02f sec; number: %d; for one: %.03f sec; found %d docs\n", $elapsed, $b, $elapsed/$b, $found );
$dbi -> disconnect;
# Run $sql on the database handle $dbi, binding @keys to its placeholders,
# and return every row as a list of hash references.
# Dies with the driver's error text when the statement cannot be prepared
# or executed.
sub exec_sql {
    my ($dbi, $sql, @keys) = @_;

    my $sth = $dbi->prepare($sql)
        or die "Cannot prepare statement: " . $dbi->errstr . "\n";
    $sth->execute( @keys )
        or die "Cannot execute statement: " . $sth->errstr . "\n";

    my @rows;
    while ( defined( my $r = $sth->fetchrow_hashref ) ) {
        push @rows, $r;
    }
    $sth->finish;
    return @rows;
}

@ -0,0 +1,73 @@
#!/usr/bin/perl
use strict;
print <<EOT;
create table message (
mid int not null,
sections int[]
);
create table message_section_map (
mid int not null,
sid int not null
);
EOT
# Write the generated rows to two temporary files, one per table.
# Three-argument open with lexical handles keeps the file names from being
# interpreted as open modes, and every failure reports the file and reason.
open( my $msg_fh, '>', 'message.tmp' )
    or die "Cannot create message.tmp: $!\n";
open( my $map_fh, '>', 'message_section_map.tmp' )
    or die "Cannot create message_section_map.tmp: $!\n";

# Fixed seed: every run makes the same sequence of rand() calls.
srand( 1 );
#foreach my $i ( 1..1778 ) {
#foreach my $i ( 1..3443 ) {
#foreach my $i ( 1..5000 ) {
#foreach my $i ( 1..29362 ) {
#foreach my $i ( 1..33331 ) {
#foreach my $i ( 1..83268 ) {
foreach my $i ( 1..200000 ) {
    my @sect;
    if ( rand() < 0.7 ) {
        # a single section, skewed towards low section numbers
        $sect[0] = int( (rand()**4)*100 );
    } else {
        # one to five draws, keeping the first occurrence of each section
        my %hash;
        @sect = grep { $hash{$_}++; $hash{$_} <= 1 } map { int( (rand()**4)*100) } 0..( int(rand()*5) );
    }
    if ( $#sect < 0 || rand() < 0.1 ) {
        # no sections: NULL array and no rows in the map table
        print {$msg_fh} "$i\t\\N\n";
    } else {
        print {$msg_fh} "$i\t{".join(',',@sect)."}\n";
        print {$map_fh} "$i\t$_\n" for @sect;
    }
}

# Buffered write errors only surface at close, so check it.
close $map_fh or die "Cannot write message_section_map.tmp: $!\n";
close $msg_fh or die "Cannot write message.tmp: $!\n";
copytable('message');
copytable('message_section_map');
print <<EOT;
CREATE unique index message_key on message ( mid );
--CREATE unique index message_section_map_key1 on message_section_map ( mid, sid );
CREATE unique index message_section_map_key2 on message_section_map ( sid, mid );
CREATE INDEX message_rdtree_idx on message using gist ( sections ) with ( islossy );
VACUUM ANALYZE;
select count(*) from message;
select count(*) from message_section_map;
EOT
unlink 'message.tmp', 'message_section_map.tmp';
# Print a "COPY <table> from stdin;" command for table $t, followed by the
# contents of the file "$t.tmp" and the end-of-data marker "\.".
# Dies when the file cannot be opened.
sub copytable {
    my $t = shift;

    print "COPY $t from stdin;\n";
    open( my $fh, '<', "$t.tmp" )
        or die "Cannot open $t.tmp: $!\n";
    while ( my $line = <$fh> ) {
        print $line;
    }
    close $fh;
    print "\\.\n";
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,19 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of seg.sql.
--
\set ECHO none
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
SELECT count(*) from test__int WHERE a && '{23,50}';
count
-------
345
(1 row)
SELECT count(*) from test__int WHERE a @ '{23,50}';
count
-------
12
(1 row)

@ -0,0 +1,15 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of seg.sql.
--
\set ECHO none
\i _int.sql
\set ECHO all
CREATE TABLE test__int( a int[] );
\copy test__int from 'data/test__int.data'
SELECT count(*) from test__int WHERE a && '{23,50}';
SELECT count(*) from test__int WHERE a @ '{23,50}';
Loading…
Cancel
Save