Code review of CLUSTER patch. Clean up problems with relcache getting

confused, toasted data getting lost, etc.
24 years ago · e44beef712
parent 9bccdf17f7
commit e44beef712
12 changed files with 558 additions and 429 deletions
--- a/doc/src/sgml/release.sgml
+++ b/doc/src/sgml/release.sgml
@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/release.sgml,v 1.145 2002/08/02 18:15:04 tgl Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/release.sgml,v 1.146 2002/08/11 21:17:34 tgl Exp $
 -->

 <appendix id="release">
@ -24,6 +24,7 @@ CDATA means the content is "SGML-free", so you can write without
 worries about funny characters.
 -->
 <literallayout><![CDATA[
+CLUSTER is no longer hazardous to your schema
 COPY accepts a list of columns to copy
 ALTER TABLE DROP COLUMN
 CREATE OPERATOR CLASS/DROP OPERATOR CLASS
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.219 2002/08/06 02:36:33 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.220 2002/08/11 21:17:34 tgl Exp $
 *
 *
 * INTERFACE ROUTINES
@ -198,7 +198,8 @@ SystemAttributeByName(const char *attname, bool relhasoids)
 *		Remove the system relation specific code to elsewhere eventually.
 *
 * If storage_create is TRUE then heap_storage_create is called here,
- * else caller must call heap_storage_create later.
+ * else caller must call heap_storage_create later (or not at all,
+ * if the relation doesn't need physical storage).
 * ----------------------------------------------------------------
 */
 Relation
@ -291,7 +292,7 @@ heap_create(const char *relname,
 									 nailme);

 	/*
-	 * have the storage manager create the relation.
+	 * have the storage manager create the relation's disk file, if wanted.
 	 */
 	if (storage_create)
 		heap_storage_create(rel);
@ -684,20 +685,21 @@ heap_create_with_catalog(const char *relname,
 	tupdesc->tdhasoid = BoolToHasOid(relhasoids);
 	
 	/*
-	 * Tell heap_create not to create a physical file; we'll do that below
-	 * after all our catalog updates are done.	(This isn't really
-	 * necessary anymore, but we may as well avoid the cycles of creating
-	 * and deleting the file in case we fail.)
+	 * Create the relcache entry (mostly dummy at this point) and the
+	 * physical disk file.  (If we fail further down, it's the smgr's
+	 * responsibility to remove the disk file again.)
+	 *
+	 * NB: create a physical file only if it's not a view.
 	 */
 	new_rel_desc = heap_create(relname,
 							   relnamespace,
 							   tupdesc,
 							   shared_relation,
-							   false,
+							   (relkind != RELKIND_VIEW),
 							   allow_system_table_mods);

 	/* Fetch the relation OID assigned by heap_create */
-	new_rel_oid = new_rel_desc->rd_att->attrs[0]->attrelid;
+	new_rel_oid = RelationGetRelid(new_rel_desc);

 	/* Assign an OID for the relation's tuple type */
 	new_type_oid = newoid();
@ -761,12 +763,6 @@ heap_create_with_catalog(const char *relname,
 	 */
 	StoreConstraints(new_rel_desc, tupdesc);

-	/*
-	 * We create the disk file for this relation here
-	 */
-	if (relkind != RELKIND_VIEW)
-		heap_storage_create(new_rel_desc);
-
 	/*
 	 * ok, the relation has been cataloged, so close our relations and
 	 * return the oid of the newly created relation.
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/index.c,v 1.188 2002/08/05 03:29:16 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/index.c,v 1.189 2002/08/11 21:17:34 tgl Exp $
 *
 *
 * INTERFACE ROUTINES
@ -578,14 +578,18 @@ index_create(Oid heapRelationId,

 	indexTupDesc->tdhasoid = WITHOUTOID;
 	/*
-	 * create the index relation (but don't create storage yet)
+	 * create the index relation's relcache entry and physical disk file.
+	 * (If we fail further down, it's the smgr's responsibility to remove
+	 * the disk file again.)
 	 */
 	indexRelation = heap_create(indexRelationName,
 								namespaceId,
 								indexTupDesc,
 								shared_relation,
-								false,
+								true,
 								allow_system_table_mods);
+
+	/* Fetch the relation OID assigned by heap_create */
 	indexoid = RelationGetRelid(indexRelation);

 	/*
@ -611,11 +615,6 @@ index_create(Oid heapRelationId,
 	 */
 	UpdateRelationRelation(indexRelation);

-	/*
-	 * We create the disk file for this relation here
-	 */
-	heap_storage_create(indexRelation);
-
 	/*
 	 * now update the object id's of all the attribute tuple forms in the
 	 * index relation's tuple descriptor
--- a/src/backend/catalog/pg_depend.c
+++ b/src/backend/catalog/pg_depend.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/pg_depend.c,v 1.4 2002/08/05 03:29:16 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/pg_depend.c,v 1.5 2002/08/11 21:17:34 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -121,15 +121,16 @@ recordMultipleDependencies(const ObjectAddress *depender,

 /*
 * deleteDependencyRecordsFor -- delete all records with given depender
- * classId/objectId.
+ * classId/objectId.  Returns the number of records deleted.
 *
 * This is used when redefining an existing object.  Links leading to the
 * object do not change, and links leading from it will be recreated
 * (possibly with some differences from before).
 */
-void
+long
 deleteDependencyRecordsFor(Oid classId, Oid objectId)
 {
+	long			count = 0;
 	Relation		depRel;
 	ScanKeyData		key[2];
 	SysScanDesc		scan;
@ -150,11 +151,14 @@ deleteDependencyRecordsFor(Oid classId, Oid objectId)
 	while (HeapTupleIsValid(tup = systable_getnext(scan)))
 	{
 		simple_heap_delete(depRel, &tup->t_self);
+		count++;
 	}

 	systable_endscan(scan);

 	heap_close(depRel, RowExclusiveLock);
+
+	return count;
 }

 /*
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@ -1,29 +1,25 @@
 /*-------------------------------------------------------------------------
 *
 * cluster.c
- *	  Paul Brown's implementation of cluster index.
+ *	  CLUSTER a table on an index.
+ *
+ * There is hardly anything left of Paul Brown's original implementation...
 *
- *	  I am going to use the rename function as a model for this in the
- *	  parser and executor, and the vacuum code as an example in this
- *	  file. As I go - in contrast to the rest of postgres - there will
- *	  be BUCKETS of comments. This is to allow reviewers to understand
- *	  my (probably bogus) assumptions about the way this works.
- *														[pbrown '94]
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/cluster.c,v 1.85 2002/08/10 21:00:34 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/cluster.c,v 1.86 2002/08/11 21:17:34 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
-
 #include "postgres.h"

 #include "access/genam.h"
 #include "access/heapam.h"
+#include "catalog/catalog.h"
 #include "catalog/dependency.h"
 #include "catalog/heap.h"
 #include "catalog/index.h"
@ -40,25 +36,23 @@
 /*
 * We need one of these structs for each index in the relation to be
 * clustered.  It's basically the data needed by index_create() so
- * we can recreate the indexes after destroying the old heap.
+ * we can rebuild the indexes on the new heap.
 */
 typedef struct
 {
+	Oid			indexOID;
 	char	   *indexName;
 	IndexInfo  *indexInfo;
 	Oid			accessMethodOID;
 	Oid		   *classOID;
-	Oid			indexOID;
-	bool		isPrimary;
 } IndexAttrs;

-static Oid	copy_heap(Oid OIDOldHeap, const char *NewName);
-static void rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
-static List *get_indexattr_list (Oid OIDOldHeap);
+static Oid	make_new_heap(Oid OIDOldHeap, const char *NewName);
+static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
+static List *get_indexattr_list(Relation OldHeap);
 static void recreate_indexattr(Oid OIDOldHeap, List *indexes);
 static void swap_relfilenodes(Oid r1, Oid r2);

-Relation RelationIdGetRelation(Oid relationId);

 /*
 * cluster
@ -67,17 +61,14 @@ Relation RelationIdGetRelation(Oid relationId);
 * swapping the relfilenodes of the new table and the old table, so
 * the OID of the original table is preserved.  Thus we do not lose
 * GRANT, inheritance nor references to this table (this was a bug
- * in releases thru 7.3)
+ * in releases thru 7.3).
 *
- * Also create new indexes and swap the filenodes with the old indexes
- * the same way we do for the relation.
+ * Also create new indexes and swap the filenodes with the old indexes the
+ * same way we do for the relation.  Since we are effectively bulk-loading
+ * the new table, it's better to create the indexes afterwards than to fill
+ * them incrementally while we load the table.
 *
- * TODO:
- * 		maybe we can get away with AccessShareLock for the table.
- * 		Concurrency would be much improved.  Only acquire
- * 		AccessExclusiveLock right before swapping the filenodes.
- * 		This would allow users to CLUSTER on a regular basis,
- * 		practically eliminating the need for auto-clustered indexes.
+ * Permissions checks were done already.
 */
 void
 cluster(RangeVar *oldrelation, char *oldindexname)
@ -105,43 +96,60 @@ cluster(RangeVar *oldrelation, char *oldindexname)
 									RelationGetNamespace(OldHeap));
 	if (!OidIsValid(OIDOldIndex))
 		elog(ERROR, "CLUSTER: cannot find index \"%s\" for table \"%s\"",
-			 oldindexname, oldrelation->relname);
+			 oldindexname, RelationGetRelationName(OldHeap));
 	OldIndex = index_open(OIDOldIndex);
 	LockRelation(OldIndex, AccessExclusiveLock);

 	/*
 	 * Check that index is in fact an index on the given relation
 	 */
-	if (OldIndex->rd_index->indrelid != OIDOldHeap)
+	if (OldIndex->rd_index == NULL ||
+		OldIndex->rd_index->indrelid != OIDOldHeap)
 		elog(ERROR, "CLUSTER: \"%s\" is not an index for table \"%s\"",
-			 oldindexname, oldrelation->relname);
+			 RelationGetRelationName(OldIndex),
+			 RelationGetRelationName(OldHeap));

-	/* Drop relcache refcnts, but do NOT give up the locks */
-	heap_close(OldHeap, NoLock);
-	index_close(OldIndex);
+	/*
+	 * Disallow clustering system relations.  This will definitely NOT work
+	 * for shared relations (we have no way to update pg_class rows in other
+	 * databases), nor for nailed-in-cache relations (the relfilenode values
+	 * for those are hardwired, see relcache.c).  It might work for other
+	 * system relations, but I ain't gonna risk it.
+	 */
+	if (IsSystemRelation(OldHeap))
+		elog(ERROR, "CLUSTER: cannot cluster system relation \"%s\"",
+			 RelationGetRelationName(OldHeap));

 	/* Save the information of all indexes on the relation. */
-	indexes = get_indexattr_list(OIDOldHeap);
+	indexes = get_indexattr_list(OldHeap);
+
+	/* Drop relcache refcnts, but do NOT give up the locks */
+	index_close(OldIndex);
+	heap_close(OldHeap, NoLock);

 	/*
-	 * Create the new heap with a temporary name.
+	 * Create the new heap, using a temporary name in the same namespace
+	 * as the existing table.  NOTE: there is some risk of collision with user
+	 * relnames.  Working around this seems more trouble than it's worth; in
+	 * particular, we can't create the new heap in a different namespace from
+	 * the old, or we will have problems with the TEMP status of temp tables.
 	 */
-	snprintf(NewHeapName, NAMEDATALEN, "temp_%u", OIDOldHeap);
+	snprintf(NewHeapName, NAMEDATALEN, "pg_temp_%u", OIDOldHeap);

-	OIDNewHeap = copy_heap(OIDOldHeap, NewHeapName);
+	OIDNewHeap = make_new_heap(OIDOldHeap, NewHeapName);

-	/* We do not need CommandCounterIncrement() because copy_heap did it. */
+	/* We don't need CommandCounterIncrement() because make_new_heap did it. */

 	/*
 	 * Copy the heap data into the new table in the desired order.
 	 */
-	rebuildheap(OIDNewHeap, OIDOldHeap, OIDOldIndex);
+	copy_heap_data(OIDNewHeap, OIDOldHeap, OIDOldIndex);

-	/* To make the new heap's data visible. */
+	/* To make the new heap's data visible (probably not needed?). */
 	CommandCounterIncrement();

 	/* Swap the relfilenodes of the old and new heaps. */
-	swap_relfilenodes(OIDNewHeap, OIDOldHeap);
+	swap_relfilenodes(OIDOldHeap, OIDNewHeap);

 	CommandCounterIncrement();

@ -150,21 +158,26 @@ cluster(RangeVar *oldrelation, char *oldindexname)
 	object.objectId = OIDNewHeap;
 	object.objectSubId = 0;

-	/* The relation is local to our transaction and we know nothin
+	/*
+	 * The new relation is local to our transaction and we know nothing
 	 * depends on it, so DROP_RESTRICT should be OK.
 	 */
 	performDeletion(&object, DROP_RESTRICT);

 	/* performDeletion does CommandCounterIncrement at end */

- 	/* Recreate the indexes on the relation.  We do not need
-  	 * CommandCounterIncrement() because recreate_indexattr does it.
-   	 */
-  	recreate_indexattr(OIDOldHeap, indexes);
+	/*
+	 * Recreate each index on the relation.  We do not need
+	 * CommandCounterIncrement() because recreate_indexattr does it.
+	 */
+	recreate_indexattr(OIDOldHeap, indexes);
 }

+/*
+ * Create the new table that we will fill with correctly-ordered data.
+ */
 static Oid
-copy_heap(Oid OIDOldHeap, const char *NewName)
+make_new_heap(Oid OIDOldHeap, const char *NewName)
 {
 	TupleDesc	OldHeapDesc,
 				tupdesc;
@ -206,27 +219,29 @@ copy_heap(Oid OIDOldHeap, const char *NewName)
 	return OIDNewHeap;
 }

+/*
+ * Do the physical copying of heap data.
+ */
 static void
-rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
+copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
 {
-	Relation	LocalNewHeap,
-				LocalOldHeap,
-				LocalOldIndex;
-	IndexScanDesc ScanDesc;
-	HeapTuple	LocalHeapTuple;
+	Relation	NewHeap,
+				OldHeap,
+				OldIndex;
+	IndexScanDesc scan;
+	HeapTuple	tuple;

 	/*
 	 * Open the relations I need. Scan through the OldHeap on the OldIndex
 	 * and insert each tuple into the NewHeap.
 	 */
-	LocalNewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
-	LocalOldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
-	LocalOldIndex = index_open(OIDOldIndex);
+	NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
+	OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
+	OldIndex = index_open(OIDOldIndex);

-	ScanDesc = index_beginscan(LocalOldHeap, LocalOldIndex,
-							   SnapshotNow, 0, (ScanKey) NULL);
+	scan = index_beginscan(OldHeap, OldIndex, SnapshotNow, 0, (ScanKey) NULL);

-	while ((LocalHeapTuple = index_getnext(ScanDesc, ForwardScanDirection)) != NULL)
+	while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
 	{
 		/*
 		 * We must copy the tuple because heap_insert() will overwrite
@ -234,199 +249,280 @@ rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
 		 * retrieved tuple will actually be in a disk buffer!  Thus,
 		 * the source relation would get trashed, which is bad news if
 		 * we abort later on.  (This was a bug in releases thru 7.0)
+		 *
+		 * Note that the copied tuple will have the original OID, if any,
+		 * so this does preserve OIDs.
 		 */
-		HeapTuple	copiedTuple = heap_copytuple(LocalHeapTuple);
+		HeapTuple	copiedTuple = heap_copytuple(tuple);
+
+		simple_heap_insert(NewHeap, copiedTuple);

-		simple_heap_insert(LocalNewHeap, copiedTuple);
 		heap_freetuple(copiedTuple);

 		CHECK_FOR_INTERRUPTS();
 	}

-	index_endscan(ScanDesc);
+	index_endscan(scan);

-	index_close(LocalOldIndex);
-	heap_close(LocalOldHeap, NoLock);
-	heap_close(LocalNewHeap, NoLock);
+	index_close(OldIndex);
+	heap_close(OldHeap, NoLock);
+	heap_close(NewHeap, NoLock);
 }

-/* Get the necessary info about the indexes in the relation and
- * return a List of IndexAttrs.
+/*
+ * Get the necessary info about the indexes of the relation and
+ * return a list of IndexAttrs structures.
 */
-List *
-get_indexattr_list (Oid OIDOldHeap)
+static List *
+get_indexattr_list(Relation OldHeap)
 {
-	ScanKeyData	entry;
-	HeapScanDesc scan;
-	Relation indexRelation;
-	HeapTuple indexTuple;
 	List *indexes = NIL;
-	IndexAttrs *attrs;
-	HeapTuple tuple;
-	Form_pg_index index;
-	
-	/* Grab the index tuples by looking into RelationRelationName
-	 * by the OID of the old heap.
-	 */
-	indexRelation = heap_openr(IndexRelationName, AccessShareLock);
-	ScanKeyEntryInitialize(&entry, 0, Anum_pg_index_indrelid,
-			F_OIDEQ, ObjectIdGetDatum(OIDOldHeap));
-	scan = heap_beginscan(indexRelation, SnapshotNow, 1, &entry);
-	while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	List	   *indlist;
+
+	/* Ask the relcache to produce a list of the indexes of the old rel */
+	foreach(indlist, RelationGetIndexList(OldHeap))
 	{
-		index = (Form_pg_index) GETSTRUCT(indexTuple);
+		Oid		indexOID = (Oid) lfirsti(indlist);
+		HeapTuple indexTuple;
+		HeapTuple classTuple;
+		Form_pg_index indexForm;
+		Form_pg_class classForm;
+		IndexAttrs *attrs;
+
+		indexTuple = SearchSysCache(INDEXRELID,
+									ObjectIdGetDatum(indexOID),
+									0, 0, 0);
+		if (!HeapTupleIsValid(indexTuple))
+			elog(ERROR, "Cache lookup failed for index %u", indexOID);
+		indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
+		Assert(indexForm->indexrelid == indexOID);

 		attrs = (IndexAttrs *) palloc(sizeof(IndexAttrs));
-		attrs->indexInfo = BuildIndexInfo(index);
-		attrs->isPrimary = index->indisprimary;
-		attrs->indexOID = index->indexrelid;
-
-		/* The opclasses are copied verbatim from the original indexes.
-		*/
-		attrs->classOID = (Oid *)palloc(sizeof(Oid) *
-				attrs->indexInfo->ii_NumIndexAttrs);
-		memcpy(attrs->classOID, index->indclass,
-				sizeof(Oid) * attrs->indexInfo->ii_NumIndexAttrs);
-
-		/* Name and access method of each index come from
-		 * RelationRelationName.
-		 */
-		tuple = SearchSysCache(RELOID,
-				ObjectIdGetDatum(attrs->indexOID),
-				0, 0, 0);
-		if (!HeapTupleIsValid(tuple))
-			elog(ERROR, "CLUSTER: cannot find index %u", attrs->indexOID);
-		attrs->indexName = pstrdup(NameStr(((Form_pg_class) GETSTRUCT(tuple))->relname));
-		attrs->accessMethodOID = ((Form_pg_class) GETSTRUCT(tuple))->relam;
-		ReleaseSysCache(tuple);
+		attrs->indexOID = indexOID;
+		attrs->indexInfo = BuildIndexInfo(indexForm);
+		attrs->classOID = (Oid *)
+			palloc(sizeof(Oid) * attrs->indexInfo->ii_NumIndexAttrs);
+		memcpy(attrs->classOID, indexForm->indclass,
+			   sizeof(Oid) * attrs->indexInfo->ii_NumIndexAttrs);
+
+		/* Name and access method of each index come from pg_class */
+		classTuple = SearchSysCache(RELOID,
+									ObjectIdGetDatum(indexOID),
+									0, 0, 0);
+		if (!HeapTupleIsValid(classTuple))
+			elog(ERROR, "Cache lookup failed for index %u", indexOID);
+		classForm = (Form_pg_class) GETSTRUCT(classTuple);
+
+		attrs->indexName = pstrdup(NameStr(classForm->relname));
+		attrs->accessMethodOID = classForm->relam;
+
+		ReleaseSysCache(classTuple);
+		ReleaseSysCache(indexTuple);

 		/* Cons the gathered data into the list.  We do not care about
 		 * ordering, and this is more efficient than append.
 		 */
-		indexes=lcons((void *)attrs, indexes);
+		indexes = lcons(attrs, indexes);
 	}
-	heap_endscan(scan);
-	heap_close(indexRelation, AccessShareLock);
+
 	return indexes;
 }

-/* Create new indexes and swap the filenodes with old indexes.  Then drop
- * the new index (carrying the old heap along).
+/*
+ * Create new indexes and swap the filenodes with old indexes.  Then drop
+ * the new index (carrying the old index filenode along).
 */
-void
+static void
 recreate_indexattr(Oid OIDOldHeap, List *indexes)
 {
-	IndexAttrs *attrs;
 	List 	   *elem;
-	Oid			newIndexOID;
-	char		newIndexName[NAMEDATALEN];
-	ObjectAddress object;

-	foreach (elem, indexes)
+	foreach(elem, indexes)
 	{
-		attrs=(IndexAttrs *) lfirst(elem);
+		IndexAttrs *attrs = (IndexAttrs *) lfirst(elem);
+		Oid			newIndexOID;
+		char		newIndexName[NAMEDATALEN];
+		ObjectAddress object;

 		/* Create the new index under a temporary name */
-		snprintf(newIndexName, NAMEDATALEN, "temp_%u", attrs->indexOID);
+		snprintf(newIndexName, NAMEDATALEN, "pg_temp_%u", attrs->indexOID);

-		/* The new index will have constraint status set to false,
+		/*
+		 * The new index will have primary and constraint status set to false,
 		 * but since we will only use its filenode it doesn't matter:
 		 * after the filenode swap the index will keep the constraint
 		 * status of the old index.
 		 */
 		newIndexOID = index_create(OIDOldHeap, newIndexName,
 								   attrs->indexInfo, attrs->accessMethodOID,
-								   attrs->classOID, attrs->isPrimary,
+								   attrs->classOID, false,
 								   false, allowSystemTableMods);
 		CommandCounterIncrement();

 		/* Swap the filenodes. */
-		swap_relfilenodes(newIndexOID, attrs->indexOID);
-		setRelhasindex(OIDOldHeap, true, attrs->isPrimary, InvalidOid);
+		swap_relfilenodes(attrs->indexOID, newIndexOID);
+
+		CommandCounterIncrement();

 		/* Destroy new index with old filenode */
 		object.classId = RelOid_pg_class;
 		object.objectId = newIndexOID;
 		object.objectSubId = 0;
 		
-		/* The relation is local to our transaction and we know
+		/*
+		 * The relation is local to our transaction and we know
 		 * nothing depends on it, so DROP_RESTRICT should be OK.
 		 */
 		performDeletion(&object, DROP_RESTRICT);
 		
 		/* performDeletion does CommandCounterIncrement() at its end */
-		
-		pfree(attrs->classOID);
-		pfree(attrs);
 	}
-	freeList(indexes);
 }

-/* Swap the relfilenodes for two given relations.
+/*
+ * Swap the relfilenodes for two given relations.
+ *
+ * Also swap any TOAST links, so that the toast data moves along with
+ * the main-table data.
 */
-void
+static void
 swap_relfilenodes(Oid r1, Oid r2)
 {
-	/* I can probably keep RelationRelationName open in the main
-	 * function and pass the Relation around so I don't have to open
-	 * it every time.
-	 */
 	Relation	relRelation,
 				rel;
-	HeapTuple	reltup[2];
-	Oid			tempRFNode;
+	HeapTuple	reltup1,
+				reltup2;
+	Form_pg_class relform1,
+				relform2;
+	Oid			swaptemp;
 	int			i;
 	CatalogIndexState indstate;

-	/* We need both RelationRelationName tuples.  */
+	/* We need writable copies of both pg_class tuples. */
 	relRelation = heap_openr(RelationRelationName, RowExclusiveLock);

-	reltup[0] = SearchSysCacheCopy(RELOID,
-								   ObjectIdGetDatum(r1),
-								   0, 0, 0);
-	if (!HeapTupleIsValid(reltup[0]))
+	reltup1 = SearchSysCacheCopy(RELOID,
+								 ObjectIdGetDatum(r1),
+								 0, 0, 0);
+	if (!HeapTupleIsValid(reltup1))
 		elog(ERROR, "CLUSTER: Cannot find tuple for relation %u", r1);
-	reltup[1] = SearchSysCacheCopy(RELOID,
-								   ObjectIdGetDatum(r2),
-								   0, 0, 0);
-	if (!HeapTupleIsValid(reltup[1]))
+	relform1 = (Form_pg_class) GETSTRUCT(reltup1);
+
+	reltup2 = SearchSysCacheCopy(RELOID,
+								 ObjectIdGetDatum(r2),
+								 0, 0, 0);
+	if (!HeapTupleIsValid(reltup2))
 		elog(ERROR, "CLUSTER: Cannot find tuple for relation %u", r2);
+	relform2 = (Form_pg_class) GETSTRUCT(reltup2);

-	/* The buffer manager gets confused if we swap relfilenodes for
+	/*
+	 * The buffer manager gets confused if we swap relfilenodes for
 	 * relations that are not both local or non-local to this transaction.
 	 * Flush the buffers on both relations so the buffer manager can
-	 * forget about'em.
+	 * forget about'em.  (XXX this might not be necessary anymore?)
 	 */
-
-	rel = RelationIdGetRelation(r1);
+	rel = relation_open(r1, NoLock);
 	i = FlushRelationBuffers(rel, 0);
 	if (i < 0)
 		elog(ERROR, "CLUSTER: FlushRelationBuffers returned %d", i);
-	RelationClose(rel);
-	rel = RelationIdGetRelation(r1);
+	relation_close(rel, NoLock);
+
+	rel = relation_open(r2, NoLock);
 	i = FlushRelationBuffers(rel, 0);
 	if (i < 0)
 		elog(ERROR, "CLUSTER: FlushRelationBuffers returned %d", i);
-	RelationClose(rel);
+	relation_close(rel, NoLock);
+
+	/*
+	 * Actually swap the filenode and TOAST fields in the two tuples
+	 */
+	swaptemp = relform1->relfilenode;
+	relform1->relfilenode = relform2->relfilenode;
+	relform2->relfilenode = swaptemp;

-	/* Actually swap the filenodes */
+	swaptemp = relform1->reltoastrelid;
+	relform1->reltoastrelid = relform2->reltoastrelid;
+	relform2->reltoastrelid = swaptemp;

-	tempRFNode = ((Form_pg_class) GETSTRUCT(reltup[0]))->relfilenode;
-	((Form_pg_class) GETSTRUCT(reltup[0]))->relfilenode =
-		((Form_pg_class) GETSTRUCT(reltup[1]))->relfilenode;
-	((Form_pg_class) GETSTRUCT(reltup[1]))->relfilenode = tempRFNode;
+	/* we should not swap reltoastidxid */

-	/* Update the RelationRelationName tuples */
-	simple_heap_update(relRelation, &reltup[1]->t_self, reltup[1]);
-	simple_heap_update(relRelation, &reltup[0]->t_self, reltup[0]);
+	/* Update the tuples in pg_class */
+	simple_heap_update(relRelation, &reltup1->t_self, reltup1);
+	simple_heap_update(relRelation, &reltup2->t_self, reltup2);
 	
-	/* To keep system catalogs current. */
+	/* Keep system catalogs current */
 	indstate = CatalogOpenIndexes(relRelation);
-	CatalogIndexInsert(indstate, reltup[1]);
-	CatalogIndexInsert(indstate, reltup[0]);
+	CatalogIndexInsert(indstate, reltup1);
+	CatalogIndexInsert(indstate, reltup2);
 	CatalogCloseIndexes(indstate);

-	heap_close(relRelation, NoLock);
-	heap_freetuple(reltup[0]);
-	heap_freetuple(reltup[1]);
+	/*
+	 * If we have toast tables associated with the relations being swapped,
+	 * change their dependency links to re-associate them with their new
+	 * owning relations.  Otherwise the wrong one will get dropped ...
+	 *
+	 * NOTE: for now, we can assume the new table will have a TOAST table
+	 * if and only if the old one does.  This logic might need work if we
+	 * get smarter about dropped columns.
+	 *
+	 * NOTE: at present, a TOAST table's only dependency is the one on
+	 * its owning table.  If more are ever created, we'd need to use something
+	 * more selective than deleteDependencyRecordsFor() to get rid of only
+	 * the link we want.
+	 */
+	if (relform1->reltoastrelid || relform2->reltoastrelid)
+	{
+		ObjectAddress baseobject,
+					toastobject;
+		long		count;
+
+		if (!(relform1->reltoastrelid && relform2->reltoastrelid))
+			elog(ERROR, "CLUSTER: expected both swapped tables to have TOAST tables");
+
+		/* Delete old dependencies */
+		count = deleteDependencyRecordsFor(RelOid_pg_class,
+										   relform1->reltoastrelid);
+		if (count != 1)
+			elog(ERROR, "CLUSTER: expected one dependency record for TOAST table, found %ld",
+				 count);
+		count = deleteDependencyRecordsFor(RelOid_pg_class,
+										   relform2->reltoastrelid);
+		if (count != 1)
+			elog(ERROR, "CLUSTER: expected one dependency record for TOAST table, found %ld",
+				 count);
+
+		/* Register new dependencies */
+		baseobject.classId = RelOid_pg_class;
+		baseobject.objectId = r1;
+		baseobject.objectSubId = 0;
+		toastobject.classId = RelOid_pg_class;
+		toastobject.objectId = relform1->reltoastrelid;
+		toastobject.objectSubId = 0;
+
+		recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
+
+		baseobject.objectId = r2;
+		toastobject.objectId = relform2->reltoastrelid;
+
+		recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL);
+	}
+
+	/*
+	 * Blow away the old relcache entries now.  We need this kluge because
+	 * relcache.c indexes relcache entries by rd_node as well as OID.
+	 * It will get confused if it is asked to (re)build an entry with a new
+	 * rd_node value when there is still another entry laying about with
+	 * that same rd_node value.  (Fortunately, since one of the entries
+	 * is local in our transaction, it's sufficient to clear out our own
+	 * relcache this way; the problem cannot arise for other backends when
+	 * they see our update on the non-local relation.)
+	 */
+	RelationForgetRelation(r1);
+	RelationForgetRelation(r2);
+
+	/* Clean up. */
+	heap_freetuple(reltup1);
+	heap_freetuple(reltup2);
+
+	heap_close(relRelation, RowExclusiveLock);
 }
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.128 2002/08/06 02:36:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.129 2002/08/11 21:17:34 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -1038,11 +1038,6 @@ BufferReplace(BufferDesc *bufHdr)
 * RelationGetNumberOfBlocks
 *		Determines the current number of pages in the relation.
 *		Side effect: relation->rd_nblocks is updated.
- *
- * Note:
- *		XXX may fail for huge relations.
- *		XXX should be elsewhere.
- *		XXX maybe should be hidden
 */
 BlockNumber
 RelationGetNumberOfBlocks(Relation relation)
@ -1061,6 +1056,23 @@ RelationGetNumberOfBlocks(Relation relation)
 	return relation->rd_nblocks;
 }

+/*
+ * RelationUpdateNumberOfBlocks
+ *		Forcibly update relation->rd_nblocks.
+ *
+ * If the relcache drops an entry for a temp relation, it must call this
+ * routine after recreating the relcache entry, so that rd_nblocks is
+ * re-sync'd with reality.  See RelationGetNumberOfBlocks.
+ */
+void
+RelationUpdateNumberOfBlocks(Relation relation)
+{
+	if (relation->rd_rel->relkind == RELKIND_VIEW)
+		relation->rd_nblocks = 0;
+	else
+		relation->rd_nblocks = smgrnblocks(DEFAULT_SMGR, relation);
+}
+
 /* ---------------------------------------------------------------------
 *		DropRelationBuffers
 *
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.171 2002/08/06 02:36:35 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/cache/relcache.c,v 1.172 2002/08/11 21:17:35 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -860,11 +860,12 @@ RelationBuildDesc(RelationBuildDescInfo buildinfo,

 	/*
 	 * normal relations are not nailed into the cache; nor can a pre-existing
-	 * relation be new or temp.
+	 * relation be new.  It could be temp though.  (Actually, it could be new
+	 * too, but it's okay to forget that fact if forced to flush the entry.)
 	 */
 	relation->rd_isnailed = false;
 	relation->rd_isnew = false;
-	relation->rd_istemp = false;
+	relation->rd_istemp = isTempNamespace(relation->rd_rel->relnamespace);

 	/*
 	 * initialize the tuple descriptor (relation->rd_att).
@ -909,13 +910,23 @@ RelationBuildDesc(RelationBuildDescInfo buildinfo,
 	relation->rd_fd = -1;

 	/*
-	 * insert newly created relation into proper relcaches, restore memory
-	 * context and return the new reldesc.
+	 * Insert newly created relation into relcache hash tables.
 	 */
 	oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
 	RelationCacheInsert(relation);
 	MemoryContextSwitchTo(oldcxt);

+	/*
+	 * If it's a temp rel, RelationGetNumberOfBlocks will assume that
+	 * rd_nblocks is correct.  Must forcibly update the block count when
+	 * creating the relcache entry.  But if we are doing a rebuild, don't
+	 * do this yet; leave it to RelationClearRelation to do at the end.
+	 * (Otherwise, an elog in RelationUpdateNumberOfBlocks would leave us
+	 * with inconsistent relcache state.)
+	 */
+	if (relation->rd_istemp && oldrelation == NULL)
+		RelationUpdateNumberOfBlocks(relation);
+
 	return relation;
 }

@ -1601,8 +1612,7 @@ RelationClose(Relation relation)

 #ifdef RELCACHE_FORCE_RELEASE
 	if (RelationHasReferenceCountZero(relation) &&
-		!relation->rd_isnew &&
-		!relation->rd_istemp)
+		!relation->rd_isnew)
 		RelationClearRelation(relation, false);
 #endif
 }
@ -1733,19 +1743,15 @@ RelationClearRelation(Relation relation, bool rebuild)
 	{
 		/*
 		 * When rebuilding an open relcache entry, must preserve ref count
-		 * and new/temp flags.  Also attempt to preserve the tupledesc,
-		 * rewrite rules, and trigger substructures in place. Furthermore
-		 * we save/restore rd_nblocks (in case it is a new/temp relation)
-		 * *and* call RelationGetNumberOfBlocks (in case it isn't).
+		 * and rd_isnew flag.  Also attempt to preserve the tupledesc,
+		 * rewrite rules, and trigger substructures in place.
 		 */
 		int			old_refcnt = relation->rd_refcnt;
 		bool		old_isnew = relation->rd_isnew;
-		bool		old_istemp = relation->rd_istemp;
 		TupleDesc	old_att = relation->rd_att;
 		RuleLock   *old_rules = relation->rd_rules;
 		MemoryContext old_rulescxt = relation->rd_rulescxt;
 		TriggerDesc *old_trigdesc = relation->trigdesc;
-		BlockNumber old_nblocks = relation->rd_nblocks;
 		RelationBuildDescInfo buildinfo;

 		buildinfo.infotype = INFO_RELID;
@ -1764,7 +1770,6 @@ RelationClearRelation(Relation relation, bool rebuild)
 		}
 		RelationSetReferenceCount(relation, old_refcnt);
 		relation->rd_isnew = old_isnew;
-		relation->rd_istemp = old_istemp;
 		if (equalTupleDescs(old_att, relation->rd_att))
 		{
 			FreeTupleDesc(relation->rd_att);
@ -1791,13 +1796,14 @@ RelationClearRelation(Relation relation, bool rebuild)
 		}
 		else
 			FreeTriggerDesc(old_trigdesc);
-		relation->rd_nblocks = old_nblocks;

 		/*
-		 * this is kind of expensive, but I think we must do it in case
-		 * relation has been truncated...
+		 * Update rd_nblocks.  This is kind of expensive, but I think we must
+		 * do it in case relation has been truncated... we definitely must
+		 * do it if the rel is new or temp, since RelationGetNumberOfBlocks
+		 * will subsequently assume that the block count is correct.
 		 */
-		relation->rd_nblocks = RelationGetNumberOfBlocks(relation);
+		RelationUpdateNumberOfBlocks(relation);
 	}
 }

@ -1811,18 +1817,19 @@ RelationFlushRelation(Relation relation)
 {
 	bool		rebuild;

-	if (relation->rd_isnew || relation->rd_istemp)
+	if (relation->rd_isnew)
 	{
 		/*
-		 * New and temp relcache entries must always be rebuilt, not
-		 * flushed; else we'd forget those two important status bits.
+		 * New relcache entries are always rebuilt, not flushed; else we'd
+		 * forget the "new" status of the relation, which is a useful
+		 * optimization to have.
 		 */
 		rebuild = true;
 	}
 	else
 	{
 		/*
-		 * Nonlocal rels can be dropped from the relcache if not open.
+		 * Pre-existing rels can be dropped from the relcache if not open.
 		 */
 		rebuild = !RelationHasReferenceCountZero(relation);
 	}
@ -1921,7 +1928,7 @@ RelationCacheInvalidate(void)

 		relcacheInvalsReceived++;

-		if (RelationHasReferenceCountZero(relation) && !relation->rd_istemp)
+		if (RelationHasReferenceCountZero(relation))
 		{
 			/* Delete this entry immediately */
 			RelationClearRelation(relation, false);
@ -1965,7 +1972,10 @@ AtEOXact_RelationCache(bool commit)
 		 *
 		 * During commit, reset the flag to false, since we are now out of the
 		 * creating transaction.  During abort, simply delete the relcache
-		 * entry --- it isn't interesting any longer.
+		 * entry --- it isn't interesting any longer.  (NOTE: if we have
+		 * forgotten the isnew state of a new relation due to a forced cache
+		 * flush, the entry will get deleted anyway by shared-cache-inval
+		 * processing of the aborted pg_class insertion.)
 		 */
 		if (relation->rd_isnew)
 		{
--- a/src/include/catalog/dependency.h
+++ b/src/include/catalog/dependency.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: dependency.h,v 1.3 2002/07/16 22:12:20 tgl Exp $
+ * $Id: dependency.h,v 1.4 2002/08/11 21:17:35 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -98,6 +98,6 @@ extern void recordMultipleDependencies(const ObjectAddress *depender,
 									   int nreferenced,
 									   DependencyType behavior);

-extern void deleteDependencyRecordsFor(Oid classId, Oid objectId);
+extern long deleteDependencyRecordsFor(Oid classId, Oid objectId);

 #endif   /* DEPENDENCY_H */
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: bufmgr.h,v 1.62 2002/08/06 02:36:35 tgl Exp $
+ * $Id: bufmgr.h,v 1.63 2002/08/11 21:17:35 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -161,6 +161,7 @@ extern void AtEOXact_Buffers(bool isCommit);
 extern void FlushBufferPool(void);
 extern BlockNumber BufferGetBlockNumber(Buffer buffer);
 extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
+extern void RelationUpdateNumberOfBlocks(Relation relation);
 extern int	FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
 extern void DropRelationBuffers(Relation rel);
 extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp);
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: rel.h,v 1.61 2002/08/06 02:36:35 tgl Exp $
+ * $Id: rel.h,v 1.62 2002/08/11 21:17:35 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -113,6 +113,10 @@ typedef struct RelationData
 								 * InvalidBlockNumber */
 	int			rd_refcnt;		/* reference count */
 	bool		rd_isnew;		/* rel was created in current xact */
+	/*
+	 * NOTE: rd_isnew should be relied on only for optimization purposes;
+	 * it is possible for new-ness to be "forgotten" (eg, after CLUSTER).
+	 */
 	bool		rd_istemp;		/* rel uses the local buffer mgr */
 	bool		rd_isnailed;	/* rel is nailed in cache */
 	bool		rd_indexfound;	/* true if rd_indexlist is valid */
--- a/src/test/regress/expected/cluster.out
+++ b/src/test/regress/expected/cluster.out
@ -8,6 +8,7 @@ NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index 'clstr_tst_s_pkey
 CREATE TABLE clstr_tst (a SERIAL PRIMARY KEY,
 	b INT,
 	c TEXT,
+	d TEXT,
 	CONSTRAINT clstr_tst_con FOREIGN KEY (b) REFERENCES clstr_tst_s);
 NOTICE:  CREATE TABLE will create implicit sequence 'clstr_tst_a_seq' for SERIAL column 'clstr_tst.a'
 NOTICE:  CREATE TABLE / PRIMARY KEY will create implicit index 'clstr_tst_pkey' for table 'clstr_tst'
@ -54,220 +55,222 @@ INSERT INTO clstr_tst (b, c) VALUES (15, 'quince');
 INSERT INTO clstr_tst (b, c) VALUES (7, 'siete');
 INSERT INTO clstr_tst (b, c) VALUES (16, 'dieciseis');
 INSERT INTO clstr_tst (b, c) VALUES (8, 'ocho');
-INSERT INTO clstr_tst (b, c) VALUES (6, 'seis');
+-- This entry is needed to test that TOASTED values are copied correctly.
+INSERT INTO clstr_tst (b, c, d) VALUES (6, 'seis', repeat('xyzzy', 100000));
 CLUSTER clstr_tst_c ON clstr_tst;
-SELECT * from clstr_tst;
- a  | b  |       c       
----+----+---------------
- 10 | 14 | catorce
- 18 |  5 | cinco
-  9 |  4 | cuatro
- 26 | 19 | diecinueve
- 12 | 18 | dieciocho
- 30 | 16 | dieciseis
- 24 | 17 | diecisiete
-  2 | 10 | diez
- 23 | 12 | doce
- 11 |  2 | dos
- 25 |  9 | nueve
- 31 |  8 | ocho
-  1 | 11 | once
- 28 | 15 | quince
- 32 |  6 | seis
- 29 |  7 | siete
- 15 | 13 | trece
- 22 | 30 | treinta
- 17 | 32 | treinta y dos
-  3 | 31 | treinta y uno
-  5 |  3 | tres
- 20 |  1 | uno
-  6 | 20 | veinte
- 14 | 25 | veinticinco
- 21 | 24 | veinticuatro
-  4 | 22 | veintidos
- 19 | 29 | veintinueve
- 16 | 28 | veintiocho
- 27 | 26 | veintiseis
- 13 | 27 | veintisiete
-  7 | 23 | veintitres
-  8 | 21 | veintiuno
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst;
+ a  | b  |       c       |           substring            | length 
+----+----+---------------+--------------------------------+--------
+ 10 | 14 | catorce       |                                |       
+ 18 |  5 | cinco         |                                |       
+  9 |  4 | cuatro        |                                |       
+ 26 | 19 | diecinueve    |                                |       
+ 12 | 18 | dieciocho     |                                |       
+ 30 | 16 | dieciseis     |                                |       
+ 24 | 17 | diecisiete    |                                |       
+  2 | 10 | diez          |                                |       
+ 23 | 12 | doce          |                                |       
+ 11 |  2 | dos           |                                |       
+ 25 |  9 | nueve         |                                |       
+ 31 |  8 | ocho          |                                |       
+  1 | 11 | once          |                                |       
+ 28 | 15 | quince        |                                |       
+ 32 |  6 | seis          | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000
+ 29 |  7 | siete         |                                |       
+ 15 | 13 | trece         |                                |       
+ 22 | 30 | treinta       |                                |       
+ 17 | 32 | treinta y dos |                                |       
+  3 | 31 | treinta y uno |                                |       
+  5 |  3 | tres          |                                |       
+ 20 |  1 | uno           |                                |       
+  6 | 20 | veinte        |                                |       
+ 14 | 25 | veinticinco   |                                |       
+ 21 | 24 | veinticuatro  |                                |       
+  4 | 22 | veintidos     |                                |       
+ 19 | 29 | veintinueve   |                                |       
+ 16 | 28 | veintiocho    |                                |       
+ 27 | 26 | veintiseis    |                                |       
+ 13 | 27 | veintisiete   |                                |       
+  7 | 23 | veintitres    |                                |       
+  8 | 21 | veintiuno     |                                |       
 (32 rows)

-SELECT * from clstr_tst ORDER BY a;
- a  | b  |       c       
----+----+---------------
-  1 | 11 | once
-  2 | 10 | diez
-  3 | 31 | treinta y uno
-  4 | 22 | veintidos
-  5 |  3 | tres
-  6 | 20 | veinte
-  7 | 23 | veintitres
-  8 | 21 | veintiuno
-  9 |  4 | cuatro
- 10 | 14 | catorce
- 11 |  2 | dos
- 12 | 18 | dieciocho
- 13 | 27 | veintisiete
- 14 | 25 | veinticinco
- 15 | 13 | trece
- 16 | 28 | veintiocho
- 17 | 32 | treinta y dos
- 18 |  5 | cinco
- 19 | 29 | veintinueve
- 20 |  1 | uno
- 21 | 24 | veinticuatro
- 22 | 30 | treinta
- 23 | 12 | doce
- 24 | 17 | diecisiete
- 25 |  9 | nueve
- 26 | 19 | diecinueve
- 27 | 26 | veintiseis
- 28 | 15 | quince
- 29 |  7 | siete
- 30 | 16 | dieciseis
- 31 |  8 | ocho
- 32 |  6 | seis
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY a;
+ a  | b  |       c       |           substring            | length 
+----+----+---------------+--------------------------------+--------
+  1 | 11 | once          |                                |       
+  2 | 10 | diez          |                                |       
+  3 | 31 | treinta y uno |                                |       
+  4 | 22 | veintidos     |                                |       
+  5 |  3 | tres          |                                |       
+  6 | 20 | veinte        |                                |       
+  7 | 23 | veintitres    |                                |       
+  8 | 21 | veintiuno     |                                |       
+  9 |  4 | cuatro        |                                |       
+ 10 | 14 | catorce       |                                |       
+ 11 |  2 | dos           |                                |       
+ 12 | 18 | dieciocho     |                                |       
+ 13 | 27 | veintisiete   |                                |       
+ 14 | 25 | veinticinco   |                                |       
+ 15 | 13 | trece         |                                |       
+ 16 | 28 | veintiocho    |                                |       
+ 17 | 32 | treinta y dos |                                |       
+ 18 |  5 | cinco         |                                |       
+ 19 | 29 | veintinueve   |                                |       
+ 20 |  1 | uno           |                                |       
+ 21 | 24 | veinticuatro  |                                |       
+ 22 | 30 | treinta       |                                |       
+ 23 | 12 | doce          |                                |       
+ 24 | 17 | diecisiete    |                                |       
+ 25 |  9 | nueve         |                                |       
+ 26 | 19 | diecinueve    |                                |       
+ 27 | 26 | veintiseis    |                                |       
+ 28 | 15 | quince        |                                |       
+ 29 |  7 | siete         |                                |       
+ 30 | 16 | dieciseis     |                                |       
+ 31 |  8 | ocho          |                                |       
+ 32 |  6 | seis          | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000
 (32 rows)

-SELECT * from clstr_tst ORDER BY b;
- a  | b  |       c       
----+----+---------------
- 20 |  1 | uno
- 11 |  2 | dos
-  5 |  3 | tres
-  9 |  4 | cuatro
- 18 |  5 | cinco
- 32 |  6 | seis
- 29 |  7 | siete
- 31 |  8 | ocho
- 25 |  9 | nueve
-  2 | 10 | diez
-  1 | 11 | once
- 23 | 12 | doce
- 15 | 13 | trece
- 10 | 14 | catorce
- 28 | 15 | quince
- 30 | 16 | dieciseis
- 24 | 17 | diecisiete
- 12 | 18 | dieciocho
- 26 | 19 | diecinueve
-  6 | 20 | veinte
-  8 | 21 | veintiuno
-  4 | 22 | veintidos
-  7 | 23 | veintitres
- 21 | 24 | veinticuatro
- 14 | 25 | veinticinco
- 27 | 26 | veintiseis
- 13 | 27 | veintisiete
- 16 | 28 | veintiocho
- 19 | 29 | veintinueve
- 22 | 30 | treinta
-  3 | 31 | treinta y uno
- 17 | 32 | treinta y dos
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY b;
+ a  | b  |       c       |           substring            | length 
+----+----+---------------+--------------------------------+--------
+ 20 |  1 | uno           |                                |       
+ 11 |  2 | dos           |                                |       
+  5 |  3 | tres          |                                |       
+  9 |  4 | cuatro        |                                |       
+ 18 |  5 | cinco         |                                |       
+ 32 |  6 | seis          | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000
+ 29 |  7 | siete         |                                |       
+ 31 |  8 | ocho          |                                |       
+ 25 |  9 | nueve         |                                |       
+  2 | 10 | diez          |                                |       
+  1 | 11 | once          |                                |       
+ 23 | 12 | doce          |                                |       
+ 15 | 13 | trece         |                                |       
+ 10 | 14 | catorce       |                                |       
+ 28 | 15 | quince        |                                |       
+ 30 | 16 | dieciseis     |                                |       
+ 24 | 17 | diecisiete    |                                |       
+ 12 | 18 | dieciocho     |                                |       
+ 26 | 19 | diecinueve    |                                |       
+  6 | 20 | veinte        |                                |       
+  8 | 21 | veintiuno     |                                |       
+  4 | 22 | veintidos     |                                |       
+  7 | 23 | veintitres    |                                |       
+ 21 | 24 | veinticuatro  |                                |       
+ 14 | 25 | veinticinco   |                                |       
+ 27 | 26 | veintiseis    |                                |       
+ 13 | 27 | veintisiete   |                                |       
+ 16 | 28 | veintiocho    |                                |       
+ 19 | 29 | veintinueve   |                                |       
+ 22 | 30 | treinta       |                                |       
+  3 | 31 | treinta y uno |                                |       
+ 17 | 32 | treinta y dos |                                |       
 (32 rows)

-SELECT * from clstr_tst ORDER BY c;
- a  | b  |       c       
----+----+---------------
- 10 | 14 | catorce
- 18 |  5 | cinco
-  9 |  4 | cuatro
- 26 | 19 | diecinueve
- 12 | 18 | dieciocho
- 30 | 16 | dieciseis
- 24 | 17 | diecisiete
-  2 | 10 | diez
- 23 | 12 | doce
- 11 |  2 | dos
- 25 |  9 | nueve
- 31 |  8 | ocho
-  1 | 11 | once
- 28 | 15 | quince
- 32 |  6 | seis
- 29 |  7 | siete
- 15 | 13 | trece
- 22 | 30 | treinta
- 17 | 32 | treinta y dos
-  3 | 31 | treinta y uno
-  5 |  3 | tres
- 20 |  1 | uno
-  6 | 20 | veinte
- 14 | 25 | veinticinco
- 21 | 24 | veinticuatro
-  4 | 22 | veintidos
- 19 | 29 | veintinueve
- 16 | 28 | veintiocho
- 27 | 26 | veintiseis
- 13 | 27 | veintisiete
-  7 | 23 | veintitres
-  8 | 21 | veintiuno
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY c;
+ a  | b  |       c       |           substring            | length 
+----+----+---------------+--------------------------------+--------
+ 10 | 14 | catorce       |                                |       
+ 18 |  5 | cinco         |                                |       
+  9 |  4 | cuatro        |                                |       
+ 26 | 19 | diecinueve    |                                |       
+ 12 | 18 | dieciocho     |                                |       
+ 30 | 16 | dieciseis     |                                |       
+ 24 | 17 | diecisiete    |                                |       
+  2 | 10 | diez          |                                |       
+ 23 | 12 | doce          |                                |       
+ 11 |  2 | dos           |                                |       
+ 25 |  9 | nueve         |                                |       
+ 31 |  8 | ocho          |                                |       
+  1 | 11 | once          |                                |       
+ 28 | 15 | quince        |                                |       
+ 32 |  6 | seis          | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000
+ 29 |  7 | siete         |                                |       
+ 15 | 13 | trece         |                                |       
+ 22 | 30 | treinta       |                                |       
+ 17 | 32 | treinta y dos |                                |       
+  3 | 31 | treinta y uno |                                |       
+  5 |  3 | tres          |                                |       
+ 20 |  1 | uno           |                                |       
+  6 | 20 | veinte        |                                |       
+ 14 | 25 | veinticinco   |                                |       
+ 21 | 24 | veinticuatro  |                                |       
+  4 | 22 | veintidos     |                                |       
+ 19 | 29 | veintinueve   |                                |       
+ 16 | 28 | veintiocho    |                                |       
+ 27 | 26 | veintiseis    |                                |       
+ 13 | 27 | veintisiete   |                                |       
+  7 | 23 | veintitres    |                                |       
+  8 | 21 | veintiuno     |                                |       
 (32 rows)

 -- Verify that inheritance link still works
 INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table');
-SELECT * from clstr_tst;
- a  |  b  |       c        
----+-----+----------------
- 10 |  14 | catorce
- 18 |   5 | cinco
-  9 |   4 | cuatro
- 26 |  19 | diecinueve
- 12 |  18 | dieciocho
- 30 |  16 | dieciseis
- 24 |  17 | diecisiete
-  2 |  10 | diez
- 23 |  12 | doce
- 11 |   2 | dos
- 25 |   9 | nueve
- 31 |   8 | ocho
-  1 |  11 | once
- 28 |  15 | quince
- 32 |   6 | seis
- 29 |   7 | siete
- 15 |  13 | trece
- 22 |  30 | treinta
- 17 |  32 | treinta y dos
-  3 |  31 | treinta y uno
-  5 |   3 | tres
- 20 |   1 | uno
-  6 |  20 | veinte
- 14 |  25 | veinticinco
- 21 |  24 | veinticuatro
-  4 |  22 | veintidos
- 19 |  29 | veintinueve
- 16 |  28 | veintiocho
- 27 |  26 | veintiseis
- 13 |  27 | veintisiete
-  7 |  23 | veintitres
-  8 |  21 | veintiuno
-  0 | 100 | in child table
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst;
+ a  |  b  |       c        |           substring            | length 
+----+-----+----------------+--------------------------------+--------
+ 10 |  14 | catorce        |                                |       
+ 18 |   5 | cinco          |                                |       
+  9 |   4 | cuatro         |                                |       
+ 26 |  19 | diecinueve     |                                |       
+ 12 |  18 | dieciocho      |                                |       
+ 30 |  16 | dieciseis      |                                |       
+ 24 |  17 | diecisiete     |                                |       
+  2 |  10 | diez           |                                |       
+ 23 |  12 | doce           |                                |       
+ 11 |   2 | dos            |                                |       
+ 25 |   9 | nueve          |                                |       
+ 31 |   8 | ocho           |                                |       
+  1 |  11 | once           |                                |       
+ 28 |  15 | quince         |                                |       
+ 32 |   6 | seis           | xyzzyxyzzyxyzzyxyzzyxyzzyxyzzy | 500000
+ 29 |   7 | siete          |                                |       
+ 15 |  13 | trece          |                                |       
+ 22 |  30 | treinta        |                                |       
+ 17 |  32 | treinta y dos  |                                |       
+  3 |  31 | treinta y uno  |                                |       
+  5 |   3 | tres           |                                |       
+ 20 |   1 | uno            |                                |       
+  6 |  20 | veinte         |                                |       
+ 14 |  25 | veinticinco    |                                |       
+ 21 |  24 | veinticuatro   |                                |       
+  4 |  22 | veintidos      |                                |       
+ 19 |  29 | veintinueve    |                                |       
+ 16 |  28 | veintiocho     |                                |       
+ 27 |  26 | veintiseis     |                                |       
+ 13 |  27 | veintisiete    |                                |       
+  7 |  23 | veintitres     |                                |       
+  8 |  21 | veintiuno      |                                |       
+  0 | 100 | in child table |                                |       
 (33 rows)

 -- Verify that foreign key link still works
 INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail');
 ERROR:  clstr_tst_con referential integrity violation - key referenced from clstr_tst not found in clstr_tst_s
-SELECT conname FROM pg_constraint WHERE conrelid=(SELECT oid FROM pg_class
-	WHERE relname='clstr_tst');
+SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass;
    conname     
 ----------------
 clstr_tst_pkey
 clstr_tst_con
 (2 rows)

-SELECT relname FROM pg_class WHERE relname LIKE 'clstr_tst%' ORDER BY relname;
-       relname        
----------------------
- clstr_tst
- clstr_tst_a_seq
- clstr_tst_b
- clstr_tst_b_c
- clstr_tst_c
- clstr_tst_c_b
- clstr_tst_inh
- clstr_tst_pkey
- clstr_tst_s
- clstr_tst_s_pkey
- clstr_tst_s_rf_a_seq
+SELECT relname, relkind,
+    EXISTS(SELECT 1 FROM pg_class WHERE oid = c.reltoastrelid) AS hastoast
+FROM pg_class c WHERE relname LIKE 'clstr_tst%' ORDER BY relname;
+       relname        | relkind | hastoast 
+----------------------+---------+----------
+ clstr_tst            | r       | t
+ clstr_tst_a_seq      | S       | f
+ clstr_tst_b          | i       | f
+ clstr_tst_b_c        | i       | f
+ clstr_tst_c          | i       | f
+ clstr_tst_c_b        | i       | f
+ clstr_tst_inh        | r       | t
+ clstr_tst_pkey       | i       | f
+ clstr_tst_s          | r       | f
+ clstr_tst_s_pkey     | i       | f
+ clstr_tst_s_rf_a_seq | S       | f
 (11 rows)

--- a/src/test/regress/sql/cluster.sql
+++ b/src/test/regress/sql/cluster.sql
@ -8,6 +8,7 @@ CREATE TABLE clstr_tst_s (rf_a SERIAL PRIMARY KEY,
 CREATE TABLE clstr_tst (a SERIAL PRIMARY KEY,
 	b INT,
 	c TEXT,
+	d TEXT,
 	CONSTRAINT clstr_tst_con FOREIGN KEY (b) REFERENCES clstr_tst_s);

 CREATE INDEX clstr_tst_b ON clstr_tst (b);
@ -55,24 +56,26 @@ INSERT INTO clstr_tst (b, c) VALUES (15, 'quince');
 INSERT INTO clstr_tst (b, c) VALUES (7, 'siete');
 INSERT INTO clstr_tst (b, c) VALUES (16, 'dieciseis');
 INSERT INTO clstr_tst (b, c) VALUES (8, 'ocho');
-INSERT INTO clstr_tst (b, c) VALUES (6, 'seis');
+-- This entry is needed to test that TOASTED values are copied correctly.
+INSERT INTO clstr_tst (b, c, d) VALUES (6, 'seis', repeat('xyzzy', 100000));

 CLUSTER clstr_tst_c ON clstr_tst;

-SELECT * from clstr_tst;
-SELECT * from clstr_tst ORDER BY a;
-SELECT * from clstr_tst ORDER BY b;
-SELECT * from clstr_tst ORDER BY c;
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst;
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY a;
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY b;
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst ORDER BY c;

 -- Verify that inheritance link still works
 INSERT INTO clstr_tst_inh VALUES (0, 100, 'in child table');
-SELECT * from clstr_tst;
+SELECT a,b,c,substring(d for 30), length(d) from clstr_tst;

 -- Verify that foreign key link still works
 INSERT INTO clstr_tst (b, c) VALUES (1111, 'this should fail');

-SELECT conname FROM pg_constraint WHERE conrelid=(SELECT oid FROM pg_class
-	WHERE relname='clstr_tst');
+SELECT conname FROM pg_constraint WHERE conrelid = 'clstr_tst'::regclass;


-SELECT relname FROM pg_class WHERE relname LIKE 'clstr_tst%' ORDER BY relname;
+SELECT relname, relkind,
+    EXISTS(SELECT 1 FROM pg_class WHERE oid = c.reltoastrelid) AS hastoast
+FROM pg_class c WHERE relname LIKE 'clstr_tst%' ORDER BY relname;