I have made three changes to the rtree code: one bug fix and

two performance improvements. I put an explanation of the changes at http://cs1.cs.nyu.edu/been/postgres-rtree.html The performance improvements are quite significant. All the changes are in the file src/backend/access/rtree/rtree.c I was working with the 7.1.3 code. I'm including the diff output as an attachment. Kenneth Been
24 years ago · 818fb55ac4
parent 7d94ac3b23
commit 818fb55ac4
1 changed files with 214 additions and 67 deletions
--- a/src/backend/access/rtree/rtree.c
+++ b/src/backend/access/rtree/rtree.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.63 2001/07/15 22:48:16 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.64 2001/09/29 03:46:12 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -55,6 +55,14 @@ typedef struct SPLITVEC
 	Datum		spl_rdatum;
 } SPLITVEC;

+/* for sorting tuples by cost, for picking split */
+typedef struct SPLITCOST
+{
+	OffsetNumber	offset_number;
+	float			cost_differential;
+	bool			choose_left;
+} SPLITCOST;
+
 typedef struct RTSTATE
 {
 	FmgrInfo	unionFn;		/* union function */
@ -92,6 +100,7 @@ static OffsetNumber choose(Relation r, Page p, IndexTuple it,
 	   RTSTATE *rtstate);
 static int	nospace(Page p, IndexTuple it);
 static void initRtstate(RTSTATE *rtstate, Relation index);
+static int qsort_comp_splitcost(const void *a, const void *b);


 /*
@ -366,7 +375,12 @@ rttighten(Relation r,
 	FunctionCall2(&rtstate->sizeFn, datum,
 				  PointerGetDatum(&newd_size));

-	if (newd_size != old_size)
+	/*
+	 * If newd_size == 0 we have degenerate rectangles, so we
+	 * don't know if there was any change, so we have to
+	 * assume there was.
+	 */
+	if ((newd_size == 0) || (newd_size != old_size))
 	{
 		TupleDesc	td = RelationGetDescr(r);

@ -442,6 +456,8 @@ rtdosplit(Relation r,
 	OffsetNumber *spl_left,
 			   *spl_right;
 	TupleDesc	tupDesc;
+	int			n;
+	OffsetNumber newitemoff;

 	p = (Page) BufferGetPage(buffer);
 	opaque = (RTreePageOpaque) PageGetSpecialPointer(p);
@ -478,56 +494,64 @@ rtdosplit(Relation r,
 	spl_right = v.spl_right;
 	leftoff = rightoff = FirstOffsetNumber;
 	maxoff = PageGetMaxOffsetNumber(p);
-	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
-	{
-		itemid = PageGetItemId(p, i);
-		item = (IndexTuple) PageGetItem(p, itemid);
-
-		if (i == *spl_left)
-		{
-			if (PageAddItem(left, (Item) item, IndexTupleSize(item),
-							leftoff, LP_USED) == InvalidOffsetNumber)
-				elog(ERROR, "rtdosplit: failed to copy index item in %s",
-					 RelationGetRelationName(r));
-			leftoff = OffsetNumberNext(leftoff);
-			spl_left++;			/* advance in left split vector */
-		}
-		else
-		{
-			Assert(i == *spl_right);
-			if (PageAddItem(right, (Item) item, IndexTupleSize(item),
-							rightoff, LP_USED) == InvalidOffsetNumber)
-				elog(ERROR, "rtdosplit: failed to copy index item in %s",
-					 RelationGetRelationName(r));
-			rightoff = OffsetNumberNext(rightoff);
-			spl_right++;		/* advance in right split vector */
-		}
-	}
+	newitemoff = OffsetNumberNext(maxoff);

 	/* build an InsertIndexResult for this insertion */
 	res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

-	/* now insert the new index tuple */
-	if (*spl_left == maxoff + 1)
+	/*
+	 * spl_left contains a list of the offset numbers of the 
+	 * tuples that will go to the left page.  For each offset
+	 * number, get the tuple item, then add the item to the
+	 * left page.  Similarly for the right side.
+	 */
+
+	/* fill left node */
+	for (n = 0; n < v.spl_nleft; n++)
 	{
-		if (PageAddItem(left, (Item) itup, IndexTupleSize(itup),
+		i = *spl_left;
+		if (i == newitemoff)
+			item = itup;
+		else
+		{
+			itemid = PageGetItemId(p, i);
+			item = (IndexTuple) PageGetItem(p, itemid);
+		}
+
+		if (PageAddItem(left, (Item) item, IndexTupleSize(item),
 						leftoff, LP_USED) == InvalidOffsetNumber)
 			elog(ERROR, "rtdosplit: failed to add index item to %s",
 				 RelationGetRelationName(r));
 		leftoff = OffsetNumberNext(leftoff);
-		ItemPointerSet(&(res->pointerData), lbknum, leftoff);
-		spl_left++;
+
+		if (i == newitemoff)
+			ItemPointerSet(&(res->pointerData), lbknum, leftoff);
+
+		spl_left++;		/* advance in left split vector */
 	}
-	else
+
+	/* fill right node */
+	for (n = 0; n < v.spl_nright; n++)
 	{
-		Assert(*spl_right == maxoff + 1);
-		if (PageAddItem(right, (Item) itup, IndexTupleSize(itup),
+		i = *spl_right;
+		if (i == newitemoff)
+			item = itup;
+		else
+		{
+			itemid = PageGetItemId(p, i);
+			item = (IndexTuple) PageGetItem(p, itemid);
+		}
+
+		if (PageAddItem(right, (Item) item, IndexTupleSize(item),
 						rightoff, LP_USED) == InvalidOffsetNumber)
 			elog(ERROR, "rtdosplit: failed to add index item to %s",
 				 RelationGetRelationName(r));
 		rightoff = OffsetNumberNext(rightoff);
-		ItemPointerSet(&(res->pointerData), rbknum, rightoff);
-		spl_right++;
+
+		if (i == newitemoff)
+			ItemPointerSet(&(res->pointerData), rbknum, rightoff);
+
+		spl_right++;		/* advance in right split vector */
 	}

 	/* Make sure we consumed all of the split vectors, and release 'em */
@ -680,8 +704,10 @@ rtnewroot(Relation r, IndexTuple lt, IndexTuple rt)
 * In addition, the item to be added (itup) is listed in the appropriate
 * vector.	It is represented by item number N+1 (N = # of items on page).
 *
- * Both vectors appear in sequence order with a terminating sentinel value
- * of InvalidOffsetNumber.
+ * Both vectors have a terminating sentinel value of InvalidOffsetNumber,
+ * but the sentinal value is no longer used, because the SPLITVEC
+ * vector also contains the length of each vector, and that information
+ * is now used to iterate over them in rtdosplit(). --kbb, 21 Sept 2001
 *
 * The bounding-box datums for the two new pages are also returned in *v.
 *
@ -736,6 +762,12 @@ rtpicksplit(Relation r,
 				item_2_sz,
 				left_avail_space,
 				right_avail_space;
+	int			total_num_tuples,
+				num_tuples_without_seeds,
+				max_after_split; /* in Guttman's lingo, (M - m) */
+	float		diff; /* diff between cost of putting tuple left or right */
+	SPLITCOST   *cost_vector;
+	int			n;

 	/*
 	 * First, make sure the new item is not so large that we can't
@ -751,6 +783,9 @@ rtpicksplit(Relation r,
 	maxoff = PageGetMaxOffsetNumber(page);
 	newitemoff = OffsetNumberNext(maxoff);		/* phony index for new
 												 * item */
+	total_num_tuples = newitemoff;
+	num_tuples_without_seeds = total_num_tuples - 2;
+	max_after_split = total_num_tuples / 2;		/* works for m = M/2 */

 	/* Make arrays big enough for worst case, including sentinel */
 	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
@ -848,47 +883,111 @@ rtpicksplit(Relation r,
 	right_avail_space = RTPageAvailSpace - IndexTupleTotalSize(item_2);

 	/*
-	 * Now split up the regions between the two seeds.	An important
-	 * property of this split algorithm is that the split vector v has the
-	 * indices of items to be split in order in its left and right
-	 * vectors.  We exploit this property by doing a merge in the code
-	 * that actually splits the page.
+	 * Now split up the regions between the two seeds.
+	 *
+	 * The cost_vector array will contain hints for determining where
+	 * each tuple should go.  Each record in the array will contain
+	 * a boolean, choose_left, that indicates which node the tuple
+	 * prefers to be on, and the absolute difference in cost between
+	 * putting the tuple in its favored node and in the other node.
+	 *
+	 * Later, we will sort the cost_vector in descending order by cost
+	 * difference, and consider the tuples in that order for
+	 * placement.  That way, the tuples that *really* want to be in
+	 * one node or the other get to choose first, and the tuples that
+	 * don't really care choose last.
+	 *
+	 * First, build the cost_vector array.  The new index tuple will
+	 * also be handled in this loop, and represented in the array,
+	 * with i==newitemoff.
+	 *
+	 * In the case of variable size tuples it is possible that we only
+	 * have the two seeds and no other tuples, in which case we don't
+	 * do any of this cost_vector stuff.
+	 */
+
+	/* to keep compiler quiet */
+	cost_vector = (SPLITCOST *) NULL;
+
+	if (num_tuples_without_seeds > 0)
+	{
+		cost_vector =
+			(SPLITCOST *) palloc(num_tuples_without_seeds * sizeof(SPLITCOST));
+		n = 0;
+		for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
+		{
+			/* Compute new union datums and sizes for both choices */
+
+			if ((i == seed_1) || (i == seed_2))
+				continue;
+			else if (i == newitemoff)
+				item_1 = itup;
+			else
+				item_1 = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
+
+			datum_alpha = IndexTupleGetDatum(item_1);
+			union_dl = FunctionCall2(&rtstate->unionFn, datum_l, datum_alpha);
+			union_dr = FunctionCall2(&rtstate->unionFn, datum_r, datum_alpha);
+			FunctionCall2(&rtstate->sizeFn, union_dl,
+						  PointerGetDatum(&size_alpha));
+			FunctionCall2(&rtstate->sizeFn, union_dr,
+						  PointerGetDatum(&size_beta));
+
+			diff = (size_alpha - size_l) - (size_beta - size_r);
+
+			cost_vector[n].offset_number = i;
+			cost_vector[n].cost_differential = fabs(diff);
+			cost_vector[n].choose_left = (diff < 0);
+
+			n++;
+		}
+
+		/*
+		 * Sort the array.  The function qsort_comp_splitcost is
+		 * set up "backwards", to provided descending order.
+		 */
+		qsort(cost_vector, num_tuples_without_seeds, sizeof(SPLITCOST),
+			  &qsort_comp_splitcost);
+	}
+
+	/*
+	 * Now make the final decisions about where each tuple will go,
+	 * and build the vectors to return in the SPLITVEC record.
 	 *
-	 * For efficiency, we also place the new index tuple in this loop. This
-	 * is handled at the very end, when we have placed all the existing
-	 * tuples and i == maxoff + 1.
+	 * The cost_vector array contains (descriptions of) all the
+	 * tuples, in the order that we want to consider them, so we
+	 * we just iterate through it and place each tuple in left
+	 * or right nodes, according to the criteria described below.
 	 */
+
 	left = v->spl_left;
 	v->spl_nleft = 0;
 	right = v->spl_right;
 	v->spl_nright = 0;

-	for (i = FirstOffsetNumber; i <= newitemoff; i = OffsetNumberNext(i))
+	/* Place the seeds first.
+	 * left avail space, left union, right avail space, and right
+	 * union have already been adjusted for the seeds.
+	 */
+
+	*left++ = seed_1;
+	v->spl_nleft++;
+
+	*right++ = seed_2;
+	v->spl_nright++;
+
+	for (n = 0; n < num_tuples_without_seeds; n++)
 	{
 		bool		left_feasible,
 					right_feasible,
 					choose_left;

 		/*
-		 * If we've already decided where to place this item, just put it
-		 * on the correct list.  Otherwise, we need to figure out which
-		 * page needs the least enlargement in order to store the item.
+		 * We need to figure out which page needs the least
+		 * enlargement in order to store the item.
 		 */

-		if (i == seed_1)
-		{
-			*left++ = i;
-			v->spl_nleft++;
-			/* left avail_space & union already includes this one */
-			continue;
-		}
-		if (i == seed_2)
-		{
-			*right++ = i;
-			v->spl_nright++;
-			/* right avail_space & union already includes this one */
-			continue;
-		}
+		i = cost_vector[n].offset_number;

 		/* Compute new union datums and sizes for both possible additions */
 		if (i == newitemoff)
@ -918,6 +1017,24 @@ rtpicksplit(Relation r,
 		 * (We know that all the old items together can fit on one page, so
 		 * we need not worry about any other problem than failing to fit
 		 * the new item.)
+		 *
+		 * Guttman's algorithm actually has two factors to consider (in
+		 * order):  1. if one node has so many tuples already assigned to
+		 * it that the other needs all the rest in order to satisfy the
+		 * condition that neither node has fewer than m tuples, then
+		 * that is decisive; 2. otherwise, choose the page that shows
+		 * the smaller enlargement of its union area.
+		 *
+		 * I have chosen m = M/2, where M is the maximum number of
+		 * tuples on a page.  (Actually, this is only strictly
+		 * true for fixed size tuples.  For variable size tuples,
+		 * there still might have to be only one tuple on a page,
+		 * if it is really big.  But even with variable size
+		 * tuples we still try to get m as close as possible to M/2.)
+		 *
+		 * The question of which page shows the smaller enlargement of
+		 * its union area has already been answered, and the answer
+		 * stored in the choose_left field of the SPLITCOST record.
 		 */
 		left_feasible = (left_avail_space >= item_1_sz &&
 						 ((left_avail_space - item_1_sz) >= newitemsz ||
@ -927,8 +1044,18 @@ rtpicksplit(Relation r,
 						   left_avail_space >= newitemsz));
 		if (left_feasible && right_feasible)
 		{
-			/* Both feasible, use Guttman's algorithm */
-			choose_left = (size_alpha - size_l < size_beta - size_r);
+			/*
+			 * Both feasible, use Guttman's algorithm.
+			 * First check the m condition described above, and if
+			 * that doesn't apply, choose the page with the smaller
+			 * enlargement of its union area.
+			 */
+			if (v->spl_nleft > max_after_split)
+				choose_left = false;
+			else if (v->spl_nright > max_after_split)
+				choose_left = true;
+			else
+				choose_left = cost_vector[n].choose_left;
 		}
 		else if (left_feasible)
 			choose_left = true;
@ -962,6 +1089,11 @@ rtpicksplit(Relation r,
 		}
 	}

+	if (num_tuples_without_seeds > 0)
+	{
+		pfree(cost_vector);
+	}
+
 	*left = *right = InvalidOffsetNumber;		/* add ending sentinels */

 	v->spl_ldatum = datum_l;
@ -1145,6 +1277,21 @@ initRtstate(RTSTATE *rtstate, Relation index)
 	return;
 }

+/* for sorting SPLITCOST records in descending order */
+static int
+qsort_comp_splitcost(const void *a, const void *b)
+{
+	float diff =
+		((SPLITCOST *)a)->cost_differential -
+		((SPLITCOST *)b)->cost_differential;
+	if (diff < 0)
+		return 1;
+	else if (diff > 0)
+		return -1;
+	else
+		return 0;
+}
+
 #ifdef RTDEBUG

 void