postgres/src/backend/commands/cluster.c

/*-------------------------------------------------------------------------
 *
 * cluster.c
 *	  Paul Brown's implementation of cluster index.
 *
 *	  I am going to use the rename function as a model for this in the
 *	  parser and executor, and the vacuum code as an example in this
 *	  file. As I go - in contrast to the rest of postgres - there will
 *	  be BUCKETS of comments. This is to allow reviewers to understand
 *	  my (probably bogus) assumptions about the way this works.
 *														[pbrown '94]
 *
 * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994-5, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/commands/cluster.c,v 1.84 2002/08/10 20:43:46 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/catname.h"
#include "catalog/pg_index.h"
#include "catalog/pg_proc.h"
#include "commands/cluster.h"
#include "commands/tablecmds.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#include "utils/relcache.h"

/*
 * We need one of these structs for each index in the relation to be
 * clustered.  It's basically the data needed by index_create() so
 * we can recreate the indexes after destroying the old heap.
 *
 * The structs are built by get_indexattr_list() and handed, as a List,
 * to recreate_indexattr(), which pfree's each one after rebuilding the
 * corresponding index.
 */
typedef struct
{
	char	   *indexName;		/* pstrdup'd copy of the index's pg_class relname */
	IndexInfo  *indexInfo;		/* result of BuildIndexInfo() on the pg_index row */
	Oid			accessMethodOID;	/* pg_class.relam of the index */
	Oid		   *classOID;		/* palloc'd copy of pg_index.indclass, with
								 * indexInfo->ii_NumIndexAttrs entries */
	Oid			indexOID;		/* pg_index.indexrelid of the existing index */
	bool		isPrimary;		/* pg_index.indisprimary */
} IndexAttrs;

/* File-local helpers used by cluster(); each is defined further down. */
static Oid	copy_heap(Oid OIDOldHeap, const char *NewName);
static void rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex);
static List *get_indexattr_list (Oid OIDOldHeap);
static void recreate_indexattr(Oid OIDOldHeap, List *indexes);
static void swap_relfilenodes(Oid r1, Oid r2);

/*
 * Used by swap_relfilenodes() to obtain a relcache entry from an OID.
 * NOTE(review): utils/relcache.h is already included above and presumably
 * declares this function, which would make this prototype redundant --
 * confirm before removing.
 */
Relation RelationIdGetRelation(Oid relationId);

/*
 * cluster
 *
 * Rewrite the table named by "oldrelation" in the order of the index
 * named by "oldindexname".
 *
 * The work is done by building a scratch table holding the rows in index
 * order, then exchanging the relfilenodes of the scratch table and the
 * original table.  Because only the filenodes change hands, the original
 * table keeps its OID, and so GRANTs, inheritance links and other
 * references to it survive (losing them was a bug in releases thru 7.3).
 *
 * Every index on the table is afterwards rebuilt by the same
 * create-and-swap-filenodes technique.
 *
 * Errors (unknown index name, index belonging to a different table) are
 * reported through elog(ERROR).
 *
 * TODO:
 *		maybe we can get away with AccessShareLock for the table.
 *		Concurrency would be much improved.  Only acquire
 *		AccessExclusiveLock right before swapping the filenodes.
 *		This would allow users to CLUSTER on a regular basis,
 *		practically eliminating the need for auto-clustered indexes.
 *
 *		Preserve constraint bit for the indexes.
 */
void
cluster(RangeVar *oldrelation, char *oldindexname)
{
	Relation	tableRel;
	Relation	indexRel;
	Oid			tableOid;
	Oid			indexOid;
	Oid			scratchOid;
	List	   *savedIndexes;
	ObjectAddress scratchAddr;
	char		scratchName[NAMEDATALEN];

	/*
	 * Take exclusive locks on both the table and the index; they are
	 * kept until the transaction ends.
	 */
	tableRel = heap_openrv(oldrelation, AccessExclusiveLock);
	tableOid = RelationGetRelid(tableRel);

	/* The index is looked up in the table's own namespace. */
	indexOid = get_relname_relid(oldindexname,
								 RelationGetNamespace(tableRel));
	if (!OidIsValid(indexOid))
		elog(ERROR, "CLUSTER: cannot find index \"%s\" for table \"%s\"",
			 oldindexname, oldrelation->relname);

	indexRel = index_open(indexOid);
	LockRelation(indexRel, AccessExclusiveLock);

	/* Reject an index that belongs to some other table. */
	if (indexRel->rd_index->indrelid != tableOid)
		elog(ERROR, "CLUSTER: \"%s\" is not an index for table \"%s\"",
			 oldindexname, oldrelation->relname);

	/* Release the relcache references only; the locks stay in place. */
	heap_close(tableRel, NoLock);
	index_close(indexRel);

	/* Remember how to rebuild every index before anything is changed. */
	savedIndexes = get_indexattr_list(tableOid);

	/* Build the scratch heap under a name derived from the table's OID. */
	snprintf(scratchName, sizeof(scratchName), "temp_%u", tableOid);
	scratchOid = copy_heap(tableOid, scratchName);

	/*
	 * No CommandCounterIncrement() is needed at this point: copy_heap
	 * has already done it.
	 */

	/* Fill the scratch heap with the table's rows, in index order. */
	rebuildheap(scratchOid, tableOid, indexOid);

	/* Make the freshly inserted rows visible. */
	CommandCounterIncrement();

	/* Exchange the physical files of the scratch heap and the table. */
	swap_relfilenodes(scratchOid, tableOid);

	CommandCounterIncrement();

	/*
	 * Get rid of the scratch heap, which now owns the old filenode.  It
	 * is local to our transaction and we know nothing depends on it, so
	 * DROP_RESTRICT should be OK.
	 */
	scratchAddr.classId = RelOid_pg_class;
	scratchAddr.objectId = scratchOid;
	scratchAddr.objectSubId = 0;
	performDeletion(&scratchAddr, DROP_RESTRICT);

	/* performDeletion does CommandCounterIncrement at end */

	/*
	 * Finally rebuild the indexes.  recreate_indexattr does its own
	 * CommandCounterIncrement(), so none is needed here.
	 */
	recreate_indexattr(tableOid, savedIndexes);
}

/*
 * copy_heap
 *
 * Create an empty relation called NewName that mirrors the relation
 * OIDOldHeap: same namespace, same tuple descriptor (including
 * constraints), same relkind, relisshared and relhasoids settings.
 * A TOAST table is added for it if one is needed.
 *
 * Returns the OID of the new relation.  The command counter has been
 * advanced by the time we return, so the new relation is visible to the
 * caller.
 */
static Oid
copy_heap(Oid OIDOldHeap, const char *NewName)
{
	Relation	srcRel;
	TupleDesc	descCopy;
	Oid			newHeapOid;

	srcRel = heap_open(OIDOldHeap, AccessExclusiveLock);

	/*
	 * heap_create_with_catalog scribbles on the descriptor it is given,
	 * so hand it a private copy rather than the relcache's own.
	 */
	descCopy = CreateTupleDescCopyConstr(RelationGetDescr(srcRel));

	newHeapOid = heap_create_with_catalog(NewName,
										  RelationGetNamespace(srcRel),
										  descCopy,
										  srcRel->rd_rel->relkind,
										  srcRel->rd_rel->relisshared,
										  srcRel->rd_rel->relhasoids,
										  allowSystemTableMods);

	/*
	 * The new relation's catalog rows must be visible before anyone can
	 * heap_open it, so bump the command counter now.
	 */
	CommandCounterIncrement();

	/*
	 * Add a TOAST table when required.  AlterTableCreateToastTable
	 * finishes with its own CommandCounterIncrement(), which makes the
	 * TOAST table usable for the inserts that follow.
	 */
	AlterTableCreateToastTable(newHeapOid, true);

	/* Drop our reference to the source, keeping its lock. */
	heap_close(srcRel, NoLock);

	return newHeapOid;
}

/*
 * rebuildheap
 *
 * Copy every tuple of OIDOldHeap into OIDNewHeap, visiting the old heap
 * through a forward scan of the index OIDOldIndex so that the tuples are
 * inserted in index order.
 *
 * All three relations are closed again before returning, but the locks
 * on the two heaps are retained.
 */
static void
rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
{
	Relation	destHeap;
	Relation	srcHeap;
	Relation	srcIndex;
	IndexScanDesc indexScan;

	destHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
	srcHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
	srcIndex = index_open(OIDOldIndex);

	/* Unqualified scan (no scan keys) of the whole index. */
	indexScan = index_beginscan(srcHeap, srcIndex,
								SnapshotNow, 0, (ScanKey) NULL);

	for (;;)
	{
		HeapTuple	fetched;
		HeapTuple	privateCopy;

		fetched = index_getnext(indexScan, ForwardScanDirection);
		if (fetched == NULL)
			break;

		/*
		 * Insert a copy, never the fetched tuple itself: heap_insert()
		 * overwrites the commit-status fields of whatever tuple it is
		 * given, and the fetched tuple lives in a disk buffer.  Writing
		 * to it would damage the source relation, which matters if we
		 * abort later.  (This was a bug in releases thru 7.0)
		 */
		privateCopy = heap_copytuple(fetched);
		simple_heap_insert(destHeap, privateCopy);
		heap_freetuple(privateCopy);

		CHECK_FOR_INTERRUPTS();
	}

	index_endscan(indexScan);

	index_close(srcIndex);
	heap_close(srcHeap, NoLock);
	heap_close(destHeap, NoLock);
}

/*
 * get_indexattr_list
 *
 * Collect, for every index defined on the relation OIDOldHeap, the data
 * needed to recreate it later.  Returns a List of palloc'd IndexAttrs
 * structs (NIL if the relation has no indexes), in no particular order.
 */
List *
get_indexattr_list(Oid OIDOldHeap)
{
	Relation	pgIndexRel;
	ScanKeyData key;
	HeapScanDesc pgIndexScan;
	HeapTuple	pgIndexTup;
	List	   *result = NIL;

	/*
	 * Scan IndexRelationName for the rows whose indrelid matches the
	 * OID of the old heap.
	 */
	pgIndexRel = heap_openr(IndexRelationName, AccessShareLock);
	ScanKeyEntryInitialize(&key, 0, Anum_pg_index_indrelid,
						   F_OIDEQ, ObjectIdGetDatum(OIDOldHeap));
	pgIndexScan = heap_beginscan(pgIndexRel, SnapshotNow, 1, &key);

	for (;;)
	{
		Form_pg_index idxForm;
		Form_pg_class classForm;
		HeapTuple	classTup;
		IndexAttrs *item;
		size_t		classBytes;

		pgIndexTup = heap_getnext(pgIndexScan, ForwardScanDirection);
		if (pgIndexTup == NULL)
			break;

		idxForm = (Form_pg_index) GETSTRUCT(pgIndexTup);

		item = (IndexAttrs *) palloc(sizeof(IndexAttrs));
		item->indexInfo = BuildIndexInfo(idxForm);
		item->isPrimary = idxForm->indisprimary;
		item->indexOID = idxForm->indexrelid;

		/* Take a verbatim copy of the original index's opclasses. */
		classBytes = sizeof(Oid) * item->indexInfo->ii_NumIndexAttrs;
		item->classOID = (Oid *) palloc(classBytes);
		memcpy(item->classOID, idxForm->indclass, classBytes);

		/*
		 * The index's name and access method are not in its pg_index
		 * row; fetch them from its RelationRelationName row.
		 */
		classTup = SearchSysCache(RELOID,
								  ObjectIdGetDatum(item->indexOID),
								  0, 0, 0);
		if (!HeapTupleIsValid(classTup))
			elog(ERROR, "CLUSTER: cannot find index %u", item->indexOID);
		classForm = (Form_pg_class) GETSTRUCT(classTup);
		item->indexName = pstrdup(NameStr(classForm->relname));
		item->accessMethodOID = classForm->relam;
		ReleaseSysCache(classTup);

		/*
		 * Prepend to the result.  Ordering is irrelevant, and this is
		 * more efficient than appending.
		 */
		result = lcons((void *) item, result);
	}

	heap_endscan(pgIndexScan);
	heap_close(pgIndexRel, AccessShareLock);

	return result;
}

/*
 * recreate_indexattr
 *
 * Rebuild every index described in "indexes" (a List of IndexAttrs as
 * produced by get_indexattr_list) on the relation OIDOldHeap.
 *
 * For each entry a new index is created under a temporary name, its
 * relfilenode is swapped with that of the original index, and the new
 * index -- which by then carries the original index's old storage -- is
 * dropped.  The original index therefore keeps its OID.
 *
 * The IndexAttrs structs, the palloc'd members this file allocates for
 * them (classOID and indexName) and the list itself are freed here; the
 * caller must not use "indexes" afterwards.
 */
void
recreate_indexattr(Oid OIDOldHeap, List *indexes)
{
	IndexAttrs *attrs;
	List 	   *elem;
	Oid			newIndexOID;
	char		newIndexName[NAMEDATALEN];
	ObjectAddress object;

	foreach (elem, indexes)
	{
		attrs=(IndexAttrs *) lfirst(elem);

		/* Create the new index under a temporary name */
		snprintf(newIndexName, NAMEDATALEN, "temp_%u", attrs->indexOID);

		/* The new index will have constraint status set to false,
		 * but since we will only use its filenode it doesn't matter:
		 * after the filenode swap the index will keep the constraint
		 * status of the old index.
		 */
		newIndexOID = index_create(OIDOldHeap, newIndexName,
								   attrs->indexInfo, attrs->accessMethodOID,
								   attrs->classOID, attrs->isPrimary,
								   false, allowSystemTableMods);
		CommandCounterIncrement();

		/* Swap the filenodes. */
		swap_relfilenodes(newIndexOID, attrs->indexOID);
		setRelhasindex(OIDOldHeap, true, attrs->isPrimary, InvalidOid);

		/* I'm not sure this one is needed, but let's be safe. */
		CommandCounterIncrement();

		/* Destroy new index with old filenode */
		object.classId = RelOid_pg_class;
		object.objectId = newIndexOID;
		object.objectSubId = 0;

		/* The relation is local to our transaction and we know
		 * nothing depends on it, so DROP_RESTRICT should be OK.
		 */
		performDeletion(&object, DROP_RESTRICT);

		/* performDeletion does CommandCounterIncrement() at its end */

		/*
		 * Release everything get_indexattr_list allocated directly for
		 * this entry.  indexName is a pstrdup'd copy that is not needed
		 * by anything above, so it must be freed here too or it leaks.
		 * indexInfo is left alone: it was built by BuildIndexInfo and we
		 * do not know its internal allocations.
		 */
		pfree(attrs->indexName);
		pfree(attrs->classOID);
		pfree(attrs);
	}
	freeList(indexes);
}

/*
 * Flush the buffers of the relation identified by relid, raising an
 * elog(ERROR) if FlushRelationBuffers reports failure (a negative
 * result).  The relcache reference taken here is released again before
 * returning.
 */
static void
flush_relation_by_oid(Oid relid)
{
	Relation	rel;
	int			rc;

	rel = RelationIdGetRelation(relid);
	rc = FlushRelationBuffers(rel, 0);
	if (rc < 0)
		elog(ERROR, "CLUSTER: FlushRelationBuffers returned %d", rc);
	RelationClose(rel);
}

/*
 * swap_relfilenodes
 *
 * Exchange the relfilenode values of the relations r1 and r2 in their
 * RelationRelationName rows, so that each relation ends up pointing at
 * the other's storage.  Both rows, and the catalog's indexes, are
 * updated, and the command counter is advanced before returning.
 *
 * Raises elog(ERROR) if either relation's row cannot be found or if
 * flushing either relation's buffers fails.
 */
void
swap_relfilenodes(Oid r1, Oid r2)
{
	/* I can probably keep RelationRelationName open in the main
	 * function and pass the Relation around so I don't have to open
	 * it every time.
	 */
	Relation	relRelation,
				irels[Num_pg_class_indices];
	HeapTuple	reltup[2];
	Oid			tempRFNode;

	/* We need both RelationRelationName tuples.  */
	relRelation = heap_openr(RelationRelationName, RowExclusiveLock);

	reltup[0] = SearchSysCacheCopy(RELOID,
								   ObjectIdGetDatum(r1),
								   0, 0, 0);
	if (!HeapTupleIsValid(reltup[0]))
		elog(ERROR, "CLUSTER: Cannot find tuple for relation %u", r1);
	reltup[1] = SearchSysCacheCopy(RELOID,
								   ObjectIdGetDatum(r2),
								   0, 0, 0);
	if (!HeapTupleIsValid(reltup[1]))
		elog(ERROR, "CLUSTER: Cannot find tuple for relation %u", r2);

	/* The buffer manager gets confused if we swap relfilenodes for
	 * relations that are not both local or non-local to this transaction.
	 * Flush the buffers on both relations so the buffer manager can
	 * forget about'em.
	 *
	 * Each relation must be flushed exactly once: r1 first, then r2.
	 * (An earlier version of this code flushed r1 twice and never
	 * touched r2.)
	 */
	flush_relation_by_oid(r1);
	flush_relation_by_oid(r2);

	/* Actually swap the filenodes */

	tempRFNode = ((Form_pg_class) GETSTRUCT(reltup[0]))->relfilenode;
	((Form_pg_class) GETSTRUCT(reltup[0]))->relfilenode =
		((Form_pg_class) GETSTRUCT(reltup[1]))->relfilenode;
	((Form_pg_class) GETSTRUCT(reltup[1]))->relfilenode = tempRFNode;

	/* Update the RelationRelationName tuples */
	simple_heap_update(relRelation, &reltup[1]->t_self, reltup[1]);
	simple_heap_update(relRelation, &reltup[0]->t_self, reltup[0]);

	/* To keep system catalogs current. */
	CatalogOpenIndices(Num_pg_class_indices, Name_pg_class_indices, irels);
	CatalogIndexInsert(irels, Num_pg_class_indices, relRelation, reltup[1]);
	CatalogIndexInsert(irels, Num_pg_class_indices, relRelation, reltup[0]);
	CatalogCloseIndices(Num_pg_class_indices, irels);
	CommandCounterIncrement();

	/* Keep the lock on the catalog; just drop our reference and copies. */
	heap_close(relRelation, NoLock);
	heap_freetuple(reltup[0]);
	heap_freetuple(reltup[1]);
}