diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index d74b3e853c6..4d88eba5e3a 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -6,8 +6,8 @@ OBJS = \ pg_buffercache_pages.o EXTENSION = pg_buffercache -DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ - pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql +DATA = pg_buffercache--1.0--1.1.sql pg_buffercache--1.1--1.2.sql pg_buffercache--1.2.sql \ + pg_buffercache--1.2--1.3.sql pg_buffercache--1.3--1.4.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache diff --git a/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql new file mode 100644 index 00000000000..50956b195a8 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql @@ -0,0 +1,30 @@ +/* contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.4'" to load this file. \quit + +/* First we have to remove them from the extension */ +ALTER EXTENSION pg_buffercache DROP VIEW pg_buffercache; +ALTER EXTENSION pg_buffercache DROP FUNCTION pg_buffercache_pages(); + +/* Then we can drop them */ +DROP VIEW pg_buffercache; +DROP FUNCTION pg_buffercache_pages(); + +/* Now redefine */ +CREATE FUNCTION pg_buffercache_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_pages_v1_4' +LANGUAGE C PARALLEL SAFE; + +CREATE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode int8, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4); + +-- Don't want these to be available to public. 
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC; +REVOKE ALL ON pg_buffercache FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor; +GRANT SELECT ON pg_buffercache TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 8c060ae9abf..a82ae5f9bb5 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index c5754ea9fa5..a45f240499a 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -59,9 +59,10 @@ typedef struct * relation node/tablespace/database/blocknum and dirty indicator. 
*/ PG_FUNCTION_INFO_V1(pg_buffercache_pages); +PG_FUNCTION_INFO_V1(pg_buffercache_pages_v1_4); -Datum -pg_buffercache_pages(PG_FUNCTION_ARGS) +static Datum +pg_buffercache_pages_internal(PG_FUNCTION_ARGS, Oid rfn_typid) { FuncCallContext *funcctx; Datum result; @@ -103,7 +104,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", INT4OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", - OIDOID, -1, 0); + rfn_typid, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", @@ -209,7 +210,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } else { - values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber); + if (rfn_typid == INT8OID) + values[1] = + Int64GetDatum((int64) fctx->record[i].relfilenumber); + else + { + Assert(rfn_typid == OIDOID); + + if (fctx->record[i].relfilenumber > OID_MAX) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("relfilenode %llu is too large to be represented as an OID", + (unsigned long long) fctx->record[i].relfilenumber), + errhint("Upgrade the extension using ALTER EXTENSION pg_buffercache UPDATE")); + + values[1] = + ObjectIdGetDatum((Oid) fctx->record[i].relfilenumber); + } + nulls[1] = false; values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); nulls[2] = false; @@ -237,3 +255,16 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +/* entry point for old extension version */ +Datum +pg_buffercache_pages(PG_FUNCTION_ARGS) +{ + return pg_buffercache_pages_internal(fcinfo, OIDOID); +} + +Datum +pg_buffercache_pages_v1_4(PG_FUNCTION_ARGS) +{ + return pg_buffercache_pages_internal(fcinfo, INT8OID); +} diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index c8d673a20e3..31caf101a94 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -345,7 +345,7 @@ 
apw_load_buffers(void) { unsigned forknum; - if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database, + if (fscanf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n", &blkinfo[i].database, &blkinfo[i].tablespace, &blkinfo[i].filenumber, &forknum, &blkinfo[i].blocknum) != 5) ereport(ERROR, @@ -669,7 +669,7 @@ apw_dump_now(bool is_bgworker, bool dump_unlogged) { CHECK_FOR_INTERRUPTS(); - ret = fprintf(file, "%u,%u,%u,%u,%u\n", + ret = fprintf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n", block_info_array[i].database, block_info_array[i].tablespace, block_info_array[i].filenumber, diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out index a1ee743457c..e9b06ed3af2 100644 --- a/contrib/pg_walinspect/expected/pg_walinspect.out +++ b/contrib/pg_walinspect/expected/pg_walinspect.out @@ -54,9 +54,9 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1'); -- =================================================================== -- Test for filtering out WAL records of a particular table -- =================================================================== -SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset +SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2') - WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap'; + WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap'; ok ---- t diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql index 1b265ea7bcc..53938341257 100644 --- a/contrib/pg_walinspect/sql/pg_walinspect.sql +++ b/contrib/pg_walinspect/sql/pg_walinspect.sql @@ -39,10 +39,10 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1'); -- Test for filtering out WAL records of a particular table -- 
=================================================================== -SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset +SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2') - WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap'; + WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap'; -- =================================================================== -- Test for filtering out WAL records based on resource_manager and diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 00f833d210e..40d4e9c35e6 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1984,7 +1984,7 @@ SCRAM-SHA-256$<iteration count>:&l - relfilenode oid + relfilenode int8 Name of the on-disk file of this relation; zero means this diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 546213fa931..d8718ed61e6 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -25210,6 +25210,11 @@ SELECT collation for ('foo' COLLATE "de_DE"); timestamp with time zone + + next_relfilenumber + int8 + + diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index a06fd3e26de..e2222655804 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -62,7 +62,7 @@ - relfilenode oid + relfilenode int8 (references pg_class.relfilenode) diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml index e5b9f3f1ffa..d9e9b0f43ee 100644 --- a/doc/src/sgml/storage.sgml +++ b/doc/src/sgml/storage.sgml @@ -217,11 +217,12 @@ with the suffix _init (see ). 
-Note that while a table's filenode often matches its OID, this is -not necessarily the case; some operations, like -TRUNCATE, REINDEX, CLUSTER and some forms -of ALTER TABLE, can change the filenode while preserving the OID. -Avoid assuming that filenode and table OID are the same. +Note that a table's filenode will normally be different than the OID. For +system tables, the initial filenode will be equal to the table OID, but it will +be different if the table has ever been subjected to a rewriting operation, +such as TRUNCATE, REINDEX, +CLUSTER or some forms of ALTER TABLE. +For user tables, even the initial filenode will be different than the table OID. Also, for certain system catalogs including pg_class itself, pg_class.relfilenode contains zero. The actual filenode number of these catalogs is stored in a lower-level data diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 41b92115bff..bc093f2a887 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -100,7 +100,7 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda BlockNumber blknum; BufferGetTag(buffer, &locator, &forknum, &blknum); - elog(ERROR, "failed to add item to index page in %u/%u/%u", + elog(ERROR, "failed to add item to index page in %u/%u/" UINT64_FORMAT, locator.spcOid, locator.dbOid, locator.relNumber); } } diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index 7dd3c1d500f..d1c8a24d66f 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -26,7 +26,7 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec) static void out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec) { - appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u", + appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; blk %u; latestRemovedXid %u:%u", xlrec->locator.spcOid, xlrec->locator.dbOid, 
xlrec->locator.relNumber, xlrec->block, EpochFromFullTransactionId(xlrec->latestRemovedFullXid), diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index 923d3bc43df..70bd49303a9 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -169,7 +169,7 @@ heap2_desc(StringInfo buf, XLogReaderState *record) { xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec; - appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", + appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; tid %u/%u", xlrec->target_locator.spcOid, xlrec->target_locator.dbOid, xlrec->target_locator.relNumber, diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 4843cd530df..6192a7ba841 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -100,7 +100,7 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec; - appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u", + appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; latestRemovedXid %u:%u", xlrec->locator.spcOid, xlrec->locator.dbOid, xlrec->locator.relNumber, EpochFromFullTransactionId(xlrec->latestRemovedFullXid), diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c index b3845f93bff..df72caf1768 100644 --- a/src/backend/access/rmgrdesc/seqdesc.c +++ b/src/backend/access/rmgrdesc/seqdesc.c @@ -25,7 +25,7 @@ seq_desc(StringInfo buf, XLogReaderState *record) xl_seq_rec *xlrec = (xl_seq_rec *) rec; if (info == XLOG_SEQ_LOG) - appendStringInfo(buf, "rel %u/%u/%u", + appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT, xlrec->locator.spcOid, xlrec->locator.dbOid, xlrec->locator.relNumber); } diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 3fd7185f217..84a826bf49c 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ 
b/src/backend/access/rmgrdesc/xlogdesc.c @@ -45,8 +45,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) CheckPoint *checkpoint = (CheckPoint *) rec; appendStringInfo(buf, "redo %X/%X; " - "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; " - "oldest xid %u in DB %u; oldest multi %u in DB %u; " + "tli %u; prev tli %u; fpw %s; xid %u:%u; relfilenumber " UINT64_FORMAT ";oid %u; " + "multi %u; offset %u; oldest xid %u in DB %u; oldest multi %u in DB %u; " "oldest/newest commit timestamp xid: %u/%u; " "oldest running xid %u; %s", LSN_FORMAT_ARGS(checkpoint->redo), @@ -55,6 +55,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->fullPageWrites ? "true" : "false", EpochFromFullTransactionId(checkpoint->nextXid), XidFromFullTransactionId(checkpoint->nextXid), + checkpoint->nextRelFileNumber, checkpoint->nextOid, checkpoint->nextMulti, checkpoint->nextMultiOffset, @@ -74,6 +75,13 @@ xlog_desc(StringInfo buf, XLogReaderState *record) memcpy(&nextOid, rec, sizeof(Oid)); appendStringInfo(buf, "%u", nextOid); } + else if (info == XLOG_NEXT_RELFILENUMBER) + { + RelFileNumber nextRelFileNumber; + + memcpy(&nextRelFileNumber, rec, sizeof(RelFileNumber)); + appendStringInfo(buf, UINT64_FORMAT, nextRelFileNumber); + } else if (info == XLOG_RESTORE_POINT) { xl_restore_point *xlrec = (xl_restore_point *) rec; @@ -169,6 +177,9 @@ xlog_identify(uint8 info) case XLOG_NEXTOID: id = "NEXTOID"; break; + case XLOG_NEXT_RELFILENUMBER: + id = "NEXT_RELFILENUMBER"; + break; case XLOG_SWITCH: id = "SWITCH"; break; @@ -237,7 +248,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, appendStringInfoChar(buf, ' '); appendStringInfo(buf, - "blkref #%d: rel %u/%u/%u fork %s blk %u", + "blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, forkNames[forknum], @@ -297,7 +308,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, if (forknum != MAIN_FORKNUM) { 
appendStringInfo(buf, - ", blkref #%d: rel %u/%u/%u fork %s blk %u", + ", blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, forkNames[forknum], @@ -306,7 +317,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, else { appendStringInfo(buf, - ", blkref #%d: rel %u/%u/%u blk %u", + ", blkref #%d: rel %u/%u/" UINT64_FORMAT " blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blk); diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 72af6560600..91c2578f7a9 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -692,8 +692,9 @@ by having database restart search for files that don't have any committed entry in pg_class, but that currently isn't done because of the possibility of deleting data that is useful for forensic analysis of the crash. Orphan files are harmless --- at worst they waste a bit of disk space --- -because we check for on-disk collisions when allocating new relfilenumber -OIDs. So cleaning up isn't really necessary. +because the relfilenumber counter is monotonically increasing. The maximum +value is 2^56-1, and there is no provision for wraparound. Thus, on-disk +collisions aren't possible. 3. Deleting a table, which requires an unlink() that could fail. 
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 849a7ce9d6d..f99c697c2f5 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -13,12 +13,16 @@ #include "postgres.h" +#include + #include "access/clog.h" #include "access/commit_ts.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/xact.h" #include "access/xlogutils.h" +#include "catalog/pg_class.h" +#include "catalog/pg_tablespace.h" #include "commands/dbcommands.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" @@ -30,6 +34,15 @@ /* Number of OIDs to prefetch (preallocate) per XLOG write */ #define VAR_OID_PREFETCH 8192 +/* Number of RelFileNumbers to be logged per XLOG write */ +#define VAR_RELNUMBER_PER_XLOG 512 + +/* + * Need to log more if remaining logged RelFileNumbers are less than the + * threshold. Valid range could be between 0 to VAR_RELNUMBER_PER_XLOG - 1. + */ +#define VAR_RELNUMBER_NEW_XLOG_THRESHOLD 256 + /* pointer to "variable cache" in shared memory (set up by shmem.c) */ VariableCache ShmemVariableCache = NULL; @@ -521,8 +534,7 @@ ForceTransactionIdLimitUpdate(void) * wide, counter wraparound will occur eventually, and therefore it is unwise * to assume they are unique unless precautions are taken to make them so. * Hence, this routine should generally not be used directly. The only direct - * callers should be GetNewOidWithIndex() and GetNewRelFileNumber() in - * catalog/catalog.c. + * caller should be GetNewOidWithIndex() in catalog/catalog.c. */ Oid GetNewObjectId(void) @@ -612,6 +624,199 @@ SetNextObjectId(Oid nextOid) LWLockRelease(OidGenLock); } +/* + * GetNewRelFileNumber + * + * Similar to GetNewObjectId but instead of new Oid it generates new + * relfilenumber. 
+ */ +RelFileNumber +GetNewRelFileNumber(Oid reltablespace, char relpersistence) +{ + RelFileNumber result; + RelFileNumber nextRelFileNumber, + loggedRelFileNumber, + flushedRelFileNumber; + + StaticAssertStmt(VAR_RELNUMBER_NEW_XLOG_THRESHOLD < VAR_RELNUMBER_PER_XLOG, + "VAR_RELNUMBER_NEW_XLOG_THRESHOLD must be smaller than VAR_RELNUMBER_PER_XLOG"); + + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot assign RelFileNumber during recovery"); + + if (IsBinaryUpgrade) + elog(ERROR, "cannot assign RelFileNumber during binary upgrade"); + + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + + nextRelFileNumber = ShmemVariableCache->nextRelFileNumber; + loggedRelFileNumber = ShmemVariableCache->loggedRelFileNumber; + flushedRelFileNumber = ShmemVariableCache->flushedRelFileNumber; + + Assert(nextRelFileNumber <= flushedRelFileNumber); + Assert(flushedRelFileNumber <= loggedRelFileNumber); + + /* check for the wraparound for the relfilenumber counter */ + if (unlikely(nextRelFileNumber > MAX_RELFILENUMBER)) + elog(ERROR, "relfilenumber is too large"); + + /* + * If the remaining logged relfilenumbers values are less than the + * threshold value then log more. Ideally, we can wait until all + * relfilenumbers have been consumed before logging more. Nevertheless, if + * we do that, we must immediately flush the logged wal record because we + * want to ensure that the nextRelFileNumber is always larger than any + * relfilenumber already in use on disk. And, to maintain that invariant, + * we must make sure that the record we log reaches the disk before any new + * files are created with the newly logged range. + * + * So in order to avoid flushing the wal immediately, we always log before + * consuming all the relfilenumber, and now we only have to flush the newly + * logged relfilenumber wal before consuming the relfilenumber from this + * new range. 
By the time we need to flush this wal, hopefully, those have + * already been flushed with some other XLogFlush operation. + */ + if (loggedRelFileNumber - nextRelFileNumber <= + VAR_RELNUMBER_NEW_XLOG_THRESHOLD) + { + XLogRecPtr recptr; + + loggedRelFileNumber = loggedRelFileNumber + VAR_RELNUMBER_PER_XLOG; + recptr = LogNextRelFileNumber(loggedRelFileNumber); + ShmemVariableCache->loggedRelFileNumber = loggedRelFileNumber; + + /* remember for the future flush */ + ShmemVariableCache->loggedRelFileNumberRecPtr = recptr; + } + + /* + * If the nextRelFileNumber is already reached to the already flushed + * relfilenumber then flush the WAL for previously logged relfilenumber. + */ + if (nextRelFileNumber >= flushedRelFileNumber) + { + XLogFlush(ShmemVariableCache->loggedRelFileNumberRecPtr); + ShmemVariableCache->flushedRelFileNumber = loggedRelFileNumber; + } + + result = ShmemVariableCache->nextRelFileNumber; + + /* we should never be using any relfilenumber outside the flushed range */ + Assert(result <= ShmemVariableCache->flushedRelFileNumber); + + (ShmemVariableCache->nextRelFileNumber)++; + + LWLockRelease(RelFileNumberGenLock); + + /* + * Because the RelFileNumber counter only ever increases and never wraps + * around, it should be impossible for the newly-allocated RelFileNumber to + * already be in use. But, if Asserts are enabled, double check that + * there's no main-fork relation file with the new RelFileNumber already on + * disk. + */ +#ifdef USE_ASSERT_CHECKING + { + RelFileLocatorBackend rlocator; + char *rpath; + BackendId backend; + + switch (relpersistence) + { + case RELPERSISTENCE_TEMP: + backend = BackendIdForTempRelations(); + break; + case RELPERSISTENCE_UNLOGGED: + case RELPERSISTENCE_PERMANENT: + backend = InvalidBackendId; + break; + default: + elog(ERROR, "invalid relpersistence: %c", relpersistence); + } + + /* this logic should match RelationInitPhysicalAddr */ + rlocator.locator.spcOid = + reltablespace ? 
reltablespace : MyDatabaseTableSpace; + rlocator.locator.dbOid = (reltablespace == GLOBALTABLESPACE_OID) ? + InvalidOid : MyDatabaseId; + rlocator.locator.relNumber = result; + + /* + * The relpath will vary based on the backend ID, so we must + * initialize that properly here to make sure that any collisions + * based on filename are properly detected. + */ + rlocator.backend = backend; + + /* check for existing file of same name. */ + rpath = relpath(rlocator, MAIN_FORKNUM); + Assert(access(rpath, F_OK) != 0); + } +#endif + + return result; +} + +/* + * SetNextRelFileNumber + * + * This may only be called during pg_upgrade; it advances the RelFileNumber + * counter to the specified value if the current value is smaller than the + * input value. + */ +void +SetNextRelFileNumber(RelFileNumber relnumber) +{ + /* safety check, we should never get this far in a HS standby */ + if (RecoveryInProgress()) + elog(ERROR, "cannot set RelFileNumber during recovery"); + + if (!IsBinaryUpgrade) + elog(ERROR, "RelFileNumber can be set only during binary upgrade"); + + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + + /* + * If previous assigned value of the nextRelFileNumber is already higher + * than the current value then nothing to be done. This is possible + * because during upgrade the objects are not created in relfilenumber + * order. + */ + if (relnumber <= ShmemVariableCache->nextRelFileNumber) + { + LWLockRelease(RelFileNumberGenLock); + return; + } + + /* + * If the new relfilenumber to be set is greater than or equal to already + * flushed relfilenumber then log more and flush immediately. + * + * (This is less efficient than GetNewRelFileNumber, which arranges to + * log some new relfilenumbers before the old batch is exhausted in the + * hope that a flush will happen in the background before any values are + * needed from the new batch. However, since this is only used during + * binary upgrade, it shouldn't really matter.) 
+ */ + if (relnumber >= ShmemVariableCache->flushedRelFileNumber) + { + RelFileNumber newlogrelnum; + + newlogrelnum = relnumber + VAR_RELNUMBER_PER_XLOG; + XLogFlush(LogNextRelFileNumber(newlogrelnum)); + + /* we have flushed whatever we have logged so no pending flush */ + ShmemVariableCache->loggedRelFileNumber = newlogrelnum; + ShmemVariableCache->flushedRelFileNumber = newlogrelnum; + ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr; + } + + ShmemVariableCache->nextRelFileNumber = relnumber; + + LWLockRelease(RelFileNumberGenLock); +} + /* * StopGeneratingPinnedObjectIds * diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1dd6df0fe15..dff9b8d2366 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4712,6 +4712,7 @@ BootStrapXLOG(void) checkPoint.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstGenbkiObjectId; + checkPoint.nextRelFileNumber = FirstNormalRelFileNumber; checkPoint.nextMulti = FirstMultiXactId; checkPoint.nextMultiOffset = 0; checkPoint.oldestXid = FirstNormalTransactionId; @@ -4725,7 +4726,11 @@ BootStrapXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5191,7 +5196,10 @@ StartupXLOG(void) /* initialize shared memory variables from the checkpoint record */ ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = 
checkPoint.nextOid; + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; ShmemVariableCache->oidCount = 0; + ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -6663,6 +6671,24 @@ CreateCheckPoint(int flags) checkPoint.nextOid += ShmemVariableCache->oidCount; LWLockRelease(OidGenLock); + /* + * If this is a shutdown checkpoint then we can safely start allocating + * relfilenumber from the nextRelFileNumber value after the restart because + * no one else can use the relfilenumber beyond that number before the + * shutdown. OTOH, if it is a normal checkpoint then if there is a crash + * after this point then we might end up reusing the same relfilenumbers + * after the restart so we need to set the nextRelFileNumber to the already + * logged relfilenumber as no one will use number beyond this limit without + * logging again. + */ + LWLockAcquire(RelFileNumberGenLock, LW_SHARED); + if (shutdown) + checkPoint.nextRelFileNumber = ShmemVariableCache->nextRelFileNumber; + else + checkPoint.nextRelFileNumber = ShmemVariableCache->loggedRelFileNumber; + + LWLockRelease(RelFileNumberGenLock); + MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, &checkPoint.nextMultiOffset, @@ -7540,6 +7566,24 @@ XLogPutNextOid(Oid nextOid) */ } +/* + * Similar to the XLogPutNextOid but instead of writing NEXTOID log record it + * writes a NEXT_RELFILENUMBER log record. It also returns the XLogRecPtr of + * the currently logged relfilenumber record, so that the caller can flush it + * at the appropriate time. 
+ */ +XLogRecPtr +LogNextRelFileNumber(RelFileNumber nextrelnumber) +{ + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) (&nextrelnumber), sizeof(RelFileNumber)); + recptr = XLogInsert(RM_XLOG_ID, XLOG_NEXT_RELFILENUMBER); + + return recptr; +} + /* * Write an XLOG SWITCH record. * @@ -7755,6 +7799,17 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); } + if (info == XLOG_NEXT_RELFILENUMBER) + { + RelFileNumber nextRelFileNumber; + + memcpy(&nextRelFileNumber, XLogRecGetData(record), sizeof(RelFileNumber)); + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelFileNumber = nextRelFileNumber; + ShmemVariableCache->loggedRelFileNumber = nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = nextRelFileNumber; + LWLockRelease(RelFileNumberGenLock); + } else if (info == XLOG_CHECKPOINT_SHUTDOWN) { CheckPoint checkPoint; @@ -7769,6 +7824,11 @@ xlog_redo(XLogReaderState *record) ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; LWLockRelease(OidGenLock); + LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE); + ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber; + ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber; + LWLockRelease(RelFileNumberGenLock); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 8f5d4253320..cea38eccea6 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -613,7 +613,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation", + "suppressing 
prefetch in relation %u/%u/" UINT64_FORMAT " until %X/%X is replayed, which creates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -636,7 +636,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation", + "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " from block %u until %X/%X is replayed, which truncates the relation", xlrec->rlocator.spcOid, xlrec->rlocator.dbOid, xlrec->rlocator.relNumber, @@ -735,7 +735,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", + "suppressing all prefetch in relation %u/%u/" UINT64_FORMAT " until %X/%X is replayed, because the relation does not exist on disk", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -756,7 +756,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", + "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " from block %u until %X/%X is replayed, because the relation is too small", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -795,7 +795,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * truncated beneath our feet? 
*/ elog(ERROR, - "could not prefetch relation %u/%u/%u block %u", + "could not prefetch relation %u/%u/" UINT64_FORMAT " block %u", reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, reln->smgr_rlocator.locator.relNumber, @@ -934,7 +934,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", + "prefetch of %u/%u/" UINT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed), filter->filter_from_block); @@ -950,7 +950,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator, { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, - "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", + "prefetch of %u/%u/" UINT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (whole database)", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno, LSN_FORMAT_ARGS(filter->filter_until_replayed)); #endif diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index b41e6826643..1026ce5dcf7 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -2228,14 +2228,14 @@ xlog_block_info(StringInfo buf, XLogReaderState *record) continue; if (forknum != MAIN_FORKNUM) - appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", + appendStringInfo(buf, "; blkref #%d: rel %u/%u/" UINT64_FORMAT ", fork %u, blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, forknum, blk); else - appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u", + appendStringInfo(buf, "; blkref #%d: rel %u/%u/" 
UINT64_FORMAT ", blk %u", block_id, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, @@ -2433,7 +2433,7 @@ verifyBackupPageConsistency(XLogReaderState *record) if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) { elog(FATAL, - "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + "inconsistent page found, rel %u/%u/" UINT64_FORMAT ", forknum %u, blkno %u", rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, forknum, blkno); } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 563cba258dd..ffda2c210b7 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -619,17 +619,17 @@ CreateFakeRelcacheEntry(RelFileLocator rlocator) rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; /* We don't know the name of the relation; use relfilenumber instead */ - sprintf(RelationGetRelationName(rel), "%u", rlocator.relNumber); + sprintf(RelationGetRelationName(rel), UINT64_FORMAT, rlocator.relNumber); /* * We set up the lockRelId in case anything tries to lock the dummy - * relation. Note that this is fairly bogus since relNumber may be + * relation. Note that this is fairly bogus since relNumber may be completely * different from the relation's OID. It shouldn't really matter though. * In recovery, we are running by ourselves and can't have any lock * conflicts. While syncing, we already hold AccessExclusiveLock. 
*/ rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid; - rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber; + rel->rd_lockInfo.lockRelId.relId = (Oid) rlocator.relNumber; rel->rd_smgr = NULL; diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 411cac9be3f..1434bcdd85c 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -1246,7 +1246,7 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly, if (relForkNum != INIT_FORKNUM) { char initForkFile[MAXPGPATH]; - char relNumber[OIDCHARS + 1]; + char relNumber[RELNUMBERCHARS + 1]; /* * If any other type of fork, check if there is an init fork diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 2abd6b007a2..a9bd8ae008e 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -482,101 +482,6 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn) return newOid; } -/* - * GetNewRelFileNumber - * Generate a new relfilenumber that is unique within the - * database of the given tablespace. - * - * If the relfilenumber will also be used as the relation's OID, pass the - * opened pg_class catalog, and this routine will guarantee that the result - * is also an unused OID within pg_class. If the result is to be used only - * as a relfilenumber for an existing relation, pass NULL for pg_class. - * - * As with GetNewOidWithIndex(), there is some theoretical risk of a race - * condition, but it doesn't seem worth worrying about. - * - * Note: we don't support using this in bootstrap mode. All relations - * created by bootstrap have preassigned OIDs, so there's no need. 
- */ -RelFileNumber -GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) -{ - RelFileLocatorBackend rlocator; - char *rpath; - bool collides; - BackendId backend; - - /* - * If we ever get here during pg_upgrade, there's something wrong; all - * relfilenumber assignments during a binary-upgrade run should be - * determined by commands in the dump script. - */ - Assert(!IsBinaryUpgrade); - - switch (relpersistence) - { - case RELPERSISTENCE_TEMP: - backend = BackendIdForTempRelations(); - break; - case RELPERSISTENCE_UNLOGGED: - case RELPERSISTENCE_PERMANENT: - backend = InvalidBackendId; - break; - default: - elog(ERROR, "invalid relpersistence: %c", relpersistence); - return InvalidRelFileNumber; /* placate compiler */ - } - - /* This logic should match RelationInitPhysicalAddr */ - rlocator.locator.spcOid = reltablespace ? reltablespace : MyDatabaseTableSpace; - rlocator.locator.dbOid = - (rlocator.locator.spcOid == GLOBALTABLESPACE_OID) ? - InvalidOid : MyDatabaseId; - - /* - * The relpath will vary based on the backend ID, so we must initialize - * that properly here to make sure that any collisions based on filename - * are properly detected. - */ - rlocator.backend = backend; - - do - { - CHECK_FOR_INTERRUPTS(); - - /* Generate the OID */ - if (pg_class) - rlocator.locator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId, - Anum_pg_class_oid); - else - rlocator.locator.relNumber = GetNewObjectId(); - - /* Check for existing file of same name */ - rpath = relpath(rlocator, MAIN_FORKNUM); - - if (access(rpath, F_OK) == 0) - { - /* definite collision */ - collides = true; - } - else - { - /* - * Here we have a little bit of a dilemma: if errno is something - * other than ENOENT, should we declare a collision and loop? In - * practice it seems best to go ahead regardless of the errno. If - * there is a colliding file we will get an smgr failure when we - * attempt to create the new relation file. 
- */ - collides = false; - } - - pfree(rpath); - } while (collides); - - return rlocator.locator.relNumber; -} - /* * SQL callable interface for GetNewOidWithIndex(). Outside of initdb's * direct insertions into catalog tables, and recovering from corruption, this diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 9a80ccdccdf..de01da198e3 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -341,11 +341,19 @@ heap_create(const char *relname, else { /* - * If relfilenumber is unspecified by the caller then create storage - * with oid same as relid. + * If relfilenumber is unspecified by the caller then allocate a new + * one, except for system tables, for which we make the initial + * relfilenumber the same as the table OID. See the comments for + * FirstNormalRelFileNumber for an explanation of why we do this. */ if (!RelFileNumberIsValid(relfilenumber)) - relfilenumber = relid; + { + if (relid < FirstNormalObjectId) + relfilenumber = relid; + else + relfilenumber = GetNewRelFileNumber(reltablespace, + relpersistence); + } } /* @@ -901,7 +909,7 @@ InsertPgClassTuple(Relation pg_class_desc, values[Anum_pg_class_reloftype - 1] = ObjectIdGetDatum(rd_rel->reloftype); values[Anum_pg_class_relowner - 1] = ObjectIdGetDatum(rd_rel->relowner); values[Anum_pg_class_relam - 1] = ObjectIdGetDatum(rd_rel->relam); - values[Anum_pg_class_relfilenode - 1] = ObjectIdGetDatum(rd_rel->relfilenode); + values[Anum_pg_class_relfilenode - 1] = Int64GetDatum(rd_rel->relfilenode); values[Anum_pg_class_reltablespace - 1] = ObjectIdGetDatum(rd_rel->reltablespace); values[Anum_pg_class_relpages - 1] = Int32GetDatum(rd_rel->relpages); values[Anum_pg_class_reltuples - 1] = Float4GetDatum(rd_rel->reltuples); @@ -1173,12 +1181,7 @@ heap_create_with_catalog(const char *relname, if (shared_relation && reltablespace != GLOBALTABLESPACE_OID) elog(ERROR, "shared relations must be placed in pg_global tablespace"); - /* - * Allocate an OID for the relation, 
unless we were told what to use. - * - * The OID will be the relfilenumber as well, so make sure it doesn't - * collide with either pg_class OIDs or existing physical files. - */ + /* Allocate an OID for the relation, unless we were told what to use. */ if (!OidIsValid(relid)) { /* Use binary-upgrade override for pg_class.oid and relfilenumber */ @@ -1232,8 +1235,8 @@ heap_create_with_catalog(const char *relname, } if (!OidIsValid(relid)) - relid = GetNewRelFileNumber(reltablespace, pg_class_desc, - relpersistence); + relid = GetNewOidWithIndex(pg_class_desc, ClassOidIndexId, + Anum_pg_class_oid); } /* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 61f1d3926a9..1fd40c42a3a 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -898,12 +898,7 @@ index_create(Relation heapRelation, collationObjectId, classObjectId); - /* - * Allocate an OID for the index, unless we were told what to use. - * - * The OID will be the relfilenumber as well, so make sure it doesn't - * collide with either pg_class OIDs or existing physical files. - */ + /* Allocate an OID for the index, unless we were told what to use. 
*/ if (!OidIsValid(indexRelationId)) { /* Use binary-upgrade override for pg_class.oid and relfilenumber */ @@ -935,8 +930,8 @@ index_create(Relation heapRelation, } else { - indexRelationId = - GetNewRelFileNumber(tableSpaceId, pg_class, relpersistence); + indexRelationId = GetNewOidWithIndex(pg_class, ClassOidIndexId, + Anum_pg_class_oid); } } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index d708af19ed2..021e08580fb 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -968,6 +968,10 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; + if (xlrec->rlocator.relNumber > ShmemVariableCache->nextRelFileNumber) + elog(ERROR, "unexpected relnumber " UINT64_FORMAT " that is bigger than nextRelFileNumber " UINT64_FORMAT, + xlrec->rlocator.relNumber, ShmemVariableCache->nextRelFileNumber); + reln = smgropen(xlrec->rlocator, InvalidBackendId); smgrcreate(reln, xlrec->forkNum, true); } @@ -981,6 +985,10 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; + if (xlrec->rlocator.relNumber > ShmemVariableCache->nextRelFileNumber) + elog(ERROR, "unexpected relnumber " UINT64_FORMAT " that is bigger than nextRelFileNumber " UINT64_FORMAT, + xlrec->rlocator.relNumber, ShmemVariableCache->nextRelFileNumber); + reln = smgropen(xlrec->rlocator, InvalidBackendId); /* diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 7d8a75d23c2..1b8e6d57294 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14375,10 +14375,14 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) } /* - * Relfilenumbers are not unique in databases across tablespaces, so we - * need to allocate a new one in the new tablespace. - */ - newrelfilenumber = GetNewRelFileNumber(newTableSpace, NULL, + * Generate a new relfilenumber. 
We cannot reuse the old relfilenumber + * because of the possibility that that relation will be moved back to the + * original tablespace before the next checkpoint. At that point, the + * first segment of the main fork won't have been unlinked yet, and an + * attempt to create new relation storage with that same relfilenumber + * will fail. + */ + newrelfilenumber = GetNewRelFileNumber(newTableSpace, rel->rd_rel->relpersistence); /* Open old and new relation */ diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index b69ff37dbbd..cdd7986dfc3 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -267,7 +267,7 @@ CreateTableSpace(CreateTableSpaceStmt *stmt) * parts. */ if (strlen(location) + 1 + strlen(TABLESPACE_VERSION_DIRECTORY) + 1 + - OIDCHARS + 1 + OIDCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH) + OIDCHARS + 1 + RELNUMBERCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location \"%s\" is too long", diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl index 81b8c184a90..f1fa8945135 100644 --- a/src/backend/nodes/gen_node_support.pl +++ b/src/backend/nodes/gen_node_support.pl @@ -961,12 +961,12 @@ _read${n}(void) print $off "\tWRITE_UINT_FIELD($f);\n"; print $rff "\tREAD_UINT_FIELD($f);\n" unless $no_read; } - elsif ($t eq 'uint64') + elsif ($t eq 'uint64' || $t eq 'RelFileNumber') { print $off "\tWRITE_UINT64_FIELD($f);\n"; print $rff "\tREAD_UINT64_FIELD($f);\n" unless $no_read; } - elsif ($t eq 'Oid' || $t eq 'RelFileNumber') + elsif ($t eq 'Oid') { print $off "\tWRITE_OID_FIELD($f);\n"; print $rff "\tREAD_OID_FIELD($f);\n" unless $no_read; diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 2cc0ac9eb09..cdf19a9c204 100644 --- a/src/backend/replication/logical/decode.c +++ 
b/src/backend/replication/logical/decode.c @@ -154,6 +154,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) break; case XLOG_NOOP: case XLOG_NEXTOID: + case XLOG_NEXT_RELFILENUMBER: case XLOG_SWITCH: case XLOG_BACKUP_END: case XLOG_PARAMETER_CHANGE: diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 03d9c9c86a2..a0f398b458a 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -4932,7 +4932,7 @@ DisplayMapping(HTAB *tuplecid_data) hash_seq_init(&hstat, tuplecid_data); while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL) { - elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u", + elog(DEBUG3, "mapping: node: %u/%u/" UINT64_FORMAT " tid: %u/%u cmin: %u, cmax: %u", ent->key.rlocator.dbOid, ent->key.rlocator.spcOid, ent->key.rlocator.relNumber, diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 647c458b52e..c3faa68126a 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -31,7 +31,7 @@ static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, typedef struct { - Oid reloid; /* hash key */ + RelFileNumber relnumber; /* hash key */ } unlogged_relation_entry; /* @@ -184,10 +184,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) * need to be reset. Otherwise, this cleanup operation would be * O(n^2). */ - ctl.keysize = sizeof(Oid); + ctl.keysize = sizeof(RelFileNumber); ctl.entrysize = sizeof(unlogged_relation_entry); ctl.hcxt = CurrentMemoryContext; - hash = hash_create("unlogged relation OIDs", 32, &ctl, + hash = hash_create("unlogged relation RelFileNumbers", 32, &ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); /* Scan the directory. 
*/ @@ -208,10 +208,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) continue; /* - * Put the OID portion of the name into the hash table, if it - * isn't already. + * Put the RELFILENUMBER portion of the name into the hash table, + * if it isn't already. */ - ent.reloid = atooid(de->d_name); + ent.relnumber = atorelnumber(de->d_name); (void) hash_search(hash, &ent, HASH_ENTER, NULL); } @@ -248,10 +248,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) continue; /* - * See whether the OID portion of the name shows up in the hash - * table. If so, nuke it! + * See whether the RELFILENUMBER portion of the name shows up in + * the hash table. If so, nuke it! */ - ent.reloid = atooid(de->d_name); + ent.relnumber = atorelnumber(de->d_name); if (hash_search(hash, &ent, HASH_FIND, NULL)) { snprintf(rm_path, sizeof(rm_path), "%s/%s", @@ -286,7 +286,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) { ForkNumber forkNum; int relnumchars; - char relnumbuf[OIDCHARS + 1]; + char relnumbuf[RELNUMBERCHARS + 1]; char srcpath[MAXPGPATH * 2]; char dstpath[MAXPGPATH]; @@ -329,7 +329,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) { ForkNumber forkNum; int relnumchars; - char relnumbuf[OIDCHARS + 1]; + char relnumbuf[RELNUMBERCHARS + 1]; char mainpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ @@ -372,8 +372,8 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) * for a non-temporary relation and false otherwise. * * NB: If this function returns true, the caller is entitled to assume that - * *relnumchars has been set to a value no more than OIDCHARS, and thus - * that a buffer of OIDCHARS+1 characters is sufficient to hold the + * *relnumchars has been set to a value no more than RELNUMBERCHARS, and thus + * that a buffer of RELNUMBERCHARS+1 characters is sufficient to hold the * RelFileNumber portion of the filename. 
This is critical to protect against * a possible buffer overrun. */ @@ -386,7 +386,7 @@ parse_filename_for_nontemp_relation(const char *name, int *relnumchars, /* Look for a non-empty string of digits (that isn't too long). */ for (pos = 0; isdigit((unsigned char) name[pos]); ++pos) ; - if (pos == 0 || pos > OIDCHARS) + if (pos == 0 || pos > RELNUMBERCHARS) return false; *relnumchars = pos; diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index af4dab7d2c7..1210be7470b 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -273,7 +273,7 @@ restart: BlockNumber blknum; BufferGetTag(buf, &rlocator, &forknum, &blknum); - elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", + elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/" UINT64_FORMAT, blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber); /* make sure we hold an exclusive lock */ diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c2956..3c5d0410795 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +RelFileNumberGenLock 48 \ No newline at end of file diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index a515bb36ac1..bed47f07d73 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -257,6 +257,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) * next checkpoint, we prevent reassignment of the relfilenumber until it's * safe, because relfilenumber assignment skips over any existing file. * + * XXX. 
Although all of this was true when relfilenumbers were 32 bits wide, + * they are now 56 bits wide and do not wrap around, so in the future we can + * change the code to immediately unlink the first segment of the relation + * along with all the others. We still do reuse relfilenumbers when createdb() + * is performed using the file-copy method or during movedb(), but the scenario + * described above can only happen when creating a new relation. + * * We do not need to go through this dance for temp relations, though, because * we never make WAL entries for temp rels, and so a temp rel poses no threat * to the health of a regular rel that has taken over its relfilenumber. diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c1a5febcbfd..ed46ac3f44e 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -154,7 +154,7 @@ smgropen(RelFileLocator rlocator, BackendId backend) /* First time through: initialize the hash table */ HASHCTL ctl; - ctl.keysize = sizeof(RelFileLocatorBackend); + ctl.keysize = SizeOfRelFileLocatorBackend; ctl.entrysize = sizeof(SMgrRelationData); SMgrRelationHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_BLOBS); diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 34efa121b40..9f70f3526c9 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -878,7 +878,7 @@ pg_relation_filenode(PG_FUNCTION_ARGS) if (!RelFileNumberIsValid(result)) PG_RETURN_NULL(); - PG_RETURN_OID(result); + PG_RETURN_INT64(result); } /* @@ -898,9 +898,12 @@ Datum pg_filenode_relation(PG_FUNCTION_ARGS) { Oid reltablespace = PG_GETARG_OID(0); - RelFileNumber relfilenumber = PG_GETARG_OID(1); + RelFileNumber relfilenumber = PG_GETARG_INT64(1); Oid heaprel; + /* check whether the relfilenumber is within a valid range */ + CHECK_RELFILENUMBER_RANGE(relfilenumber); + /* test needed so RelidByRelfilenumber doesn't misbehave */ if 
(!RelFileNumberIsValid(relfilenumber)) PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c index 797f5f539af..fc2faed9a7d 100644 --- a/src/backend/utils/adt/pg_upgrade_support.c +++ b/src/backend/utils/adt/pg_upgrade_support.c @@ -17,6 +17,7 @@ #include "catalog/pg_type.h" #include "commands/extension.h" #include "miscadmin.h" +#include "storage/relfilelocator.h" #include "utils/array.h" #include "utils/builtins.h" @@ -98,10 +99,12 @@ binary_upgrade_set_next_heap_pg_class_oid(PG_FUNCTION_ARGS) Datum binary_upgrade_set_next_heap_relfilenode(PG_FUNCTION_ARGS) { - RelFileNumber relfilenumber = PG_GETARG_OID(0); + RelFileNumber relfilenumber = PG_GETARG_INT64(0); CHECK_IS_BINARY_UPGRADE; + CHECK_RELFILENUMBER_RANGE(relfilenumber); binary_upgrade_next_heap_pg_class_relfilenumber = relfilenumber; + SetNextRelFileNumber(relfilenumber + 1); PG_RETURN_VOID(); } @@ -120,10 +123,12 @@ binary_upgrade_set_next_index_pg_class_oid(PG_FUNCTION_ARGS) Datum binary_upgrade_set_next_index_relfilenode(PG_FUNCTION_ARGS) { - RelFileNumber relfilenumber = PG_GETARG_OID(0); + RelFileNumber relfilenumber = PG_GETARG_INT64(0); CHECK_IS_BINARY_UPGRADE; + CHECK_RELFILENUMBER_RANGE(relfilenumber); binary_upgrade_next_index_pg_class_relfilenumber = relfilenumber; + SetNextRelFileNumber(relfilenumber + 1); PG_RETURN_VOID(); } @@ -142,10 +147,12 @@ binary_upgrade_set_next_toast_pg_class_oid(PG_FUNCTION_ARGS) Datum binary_upgrade_set_next_toast_relfilenode(PG_FUNCTION_ARGS) { - RelFileNumber relfilenumber = PG_GETARG_OID(0); + RelFileNumber relfilenumber = PG_GETARG_INT64(0); CHECK_IS_BINARY_UPGRADE; + CHECK_RELFILENUMBER_RANGE(relfilenumber); binary_upgrade_next_toast_pg_class_relfilenumber = relfilenumber; + SetNextRelFileNumber(relfilenumber + 1); PG_RETURN_VOID(); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 00dc0f24037..6f4e96dd33b 100644 --- a/src/backend/utils/cache/relcache.c 
+++ b/src/backend/utils/cache/relcache.c @@ -3712,7 +3712,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) { /* Allocate a new relfilenumber */ newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace, - NULL, persistence); + persistence); } else if (relation->rd_rel->relkind == RELKIND_INDEX) { diff --git a/src/backend/utils/cache/relfilenumbermap.c b/src/backend/utils/cache/relfilenumbermap.c index c4245d5ccdd..2e0acf98f20 100644 --- a/src/backend/utils/cache/relfilenumbermap.c +++ b/src/backend/utils/cache/relfilenumbermap.c @@ -196,7 +196,7 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber) /* set scan arguments */ skey[0].sk_argument = ObjectIdGetDatum(reltablespace); - skey[1].sk_argument = ObjectIdGetDatum(relfilenumber); + skey[1].sk_argument = Int64GetDatum((int64) relfilenumber); scandesc = systable_beginscan(relation, ClassTblspcRelfilenodeIndexId, @@ -213,7 +213,7 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber) if (found) elog(ERROR, - "unexpected duplicate for tablespace %u, relfilenumber %u", + "unexpected duplicate for tablespace %u, relfilenumber " UINT64_FORMAT, reltablespace, relfilenumber); found = true; diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 781f8b87580..d441cd97e2f 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -79,8 +79,8 @@ pg_control_system(PG_FUNCTION_ARGS) Datum pg_control_checkpoint(PG_FUNCTION_ARGS) { - Datum values[18]; - bool nulls[18]; + Datum values[19]; + bool nulls[19]; TupleDesc tupdesc; HeapTuple htup; ControlFileData *ControlFile; @@ -129,6 +129,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 18, "checkpoint_time", TIMESTAMPTZOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 19, "next_relfilenumber", + INT8OID, -1, 0); tupdesc = BlessTupleDesc(tupdesc); /* Read the control 
file. */ @@ -202,6 +204,9 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) values[17] = TimestampTzGetDatum(time_t_to_timestamptz(ControlFile->checkPointCopy.time)); nulls[17] = false; + values[18] = Int64GetDatum((int64) ControlFile->checkPointCopy.nextRelFileNumber); + nulls[18] = false; + htup = heap_form_tuple(tupdesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(htup)); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 324ccf77834..ddb5ec117f2 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -485,9 +485,7 @@ main(int argc, char *argv[]) mode = PG_MODE_ENABLE; break; case 'f': - if (!option_parse_int(optarg, "-f/--filenode", 0, - INT_MAX, - NULL)) + if (!option_parse_relfilenumber(optarg, "-f/--filenode")) exit(1); only_filenode = pstrdup(optarg); break; diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index c390ec51ce9..2f0e91fc2f9 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -250,6 +250,8 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's NextXID: %u:%u\n"), EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid), XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid)); + printf(_("Latest checkpoint's NextRelFileNumber:%llu\n"), + (unsigned long long) ControlFile->checkPointCopy.nextRelFileNumber); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index bd9b066e4eb..9f78971cab5 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -3184,15 +3184,15 @@ dumpDatabase(Archive *fout) atooid(PQgetvalue(lo_res, i, ii_oid))); oid = atooid(PQgetvalue(lo_res, i, ii_oid)); - relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode)); + relfilenumber = 
atorelnumber(PQgetvalue(lo_res, i, ii_relfilenode)); if (oid == LargeObjectRelationId) appendPQExpBuffer(loOutQry, - "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n", relfilenumber); else if (oid == LargeObjectLOidPNIndexId) appendPQExpBuffer(loOutQry, - "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n", relfilenumber); } @@ -4877,16 +4877,16 @@ binary_upgrade_set_pg_class_oids(Archive *fout, relkind = *PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "relkind")); - relfilenumber = atooid(PQgetvalue(upgrade_res, 0, - PQfnumber(upgrade_res, "relfilenode"))); + relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0, + PQfnumber(upgrade_res, "relfilenode"))); toast_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "reltoastrelid"))); - toast_relfilenumber = atooid(PQgetvalue(upgrade_res, 0, - PQfnumber(upgrade_res, "toast_relfilenode"))); + toast_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0, + PQfnumber(upgrade_res, "toast_relfilenode"))); toast_index_oid = atooid(PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "indexrelid"))); - toast_index_relfilenumber = atooid(PQgetvalue(upgrade_res, 0, - PQfnumber(upgrade_res, "toast_index_relfilenode"))); + toast_index_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0, + PQfnumber(upgrade_res, "toast_index_relfilenode"))); appendPQExpBufferStr(upgrade_buffer, "\n-- For binary upgrade, must preserve pg_class oids and relfilenodes\n"); @@ -4904,7 +4904,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, */ if (RelFileNumberIsValid(relfilenumber) && relkind != RELKIND_PARTITIONED_TABLE) appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT 
pg_catalog.binary_upgrade_set_next_heap_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n", relfilenumber); /* @@ -4918,7 +4918,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_toast_pg_class_oid('%u'::pg_catalog.oid);\n", toast_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n", toast_relfilenumber); /* every toast table has an index */ @@ -4926,7 +4926,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n", toast_index_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n", toast_index_relfilenumber); } @@ -4939,7 +4939,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout, "SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n", pg_class_oid); appendPQExpBuffer(upgrade_buffer, - "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n", + "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n", relfilenumber); } diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index 269ed6446e6..197ec0eac91 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -538,7 +538,7 @@ isRelDataFile(const char *path) segNo = 0; matched = false; - nmatch = sscanf(path, "global/%u.%u", &rlocator.relNumber, &segNo); + nmatch = sscanf(path, "global/" UINT64_FORMAT ".%u", &rlocator.relNumber, &segNo); if (nmatch == 1 || nmatch == 2) { rlocator.spcOid = GLOBALTABLESPACE_OID; @@ -547,7 +547,7 @@ isRelDataFile(const char *path) } else { - nmatch = 
sscanf(path, "base/%u/%u.%u", + nmatch = sscanf(path, "base/%u/" UINT64_FORMAT ".%u", &rlocator.dbOid, &rlocator.relNumber, &segNo); if (nmatch == 2 || nmatch == 3) { @@ -556,7 +556,7 @@ isRelDataFile(const char *path) } else { - nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u", + nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/" UINT64_FORMAT ".%u", &rlocator.spcOid, &rlocator.dbOid, &rlocator.relNumber, &segNo); if (nmatch == 3 || nmatch == 4) diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index f18cf971202..0c712a62669 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -527,7 +527,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) relname = PQgetvalue(res, relnum, i_relname); curr->relname = pg_strdup(relname); - curr->relfilenumber = atooid(PQgetvalue(res, relnum, i_relfilenumber)); + curr->relfilenumber = + atorelnumber(PQgetvalue(res, relnum, i_relfilenumber)); curr->tblsp_alloc = false; /* Is the tablespace oid non-default? */ diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index 115faa222e3..7ab1bcc9c8d 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -15,10 +15,8 @@ * oids are the same between old and new clusters. This is important * because toast oids are stored as toast pointers in user tables. * - * While pg_class.oid and pg_class.relfilenode are initially the same in a - * cluster, they can diverge due to CLUSTER, REINDEX, or VACUUM FULL. We - * control assignments of pg_class.relfilenode because we want the filenames - * to match between the old and new cluster. + * We control assignments of pg_class.relfilenode because we want the + * filenames to match between the old and new cluster. * * We control assignment of pg_tablespace.oid because we want the oid to match * between the old and new cluster. 
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index c3f3d6bc0af..529267d670a 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -190,14 +190,14 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro else snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno); - snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s", + snprintf(old_file, sizeof(old_file), "%s%s/%u/" UINT64_FORMAT "%s%s", map->old_tablespace, map->old_tablespace_suffix, map->db_oid, map->relfilenumber, type_suffix, extent_suffix); - snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", + snprintf(new_file, sizeof(new_file), "%s%s/%u/" UINT64_FORMAT "%s%s", map->new_tablespace, map->new_tablespace_suffix, map->db_oid, diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 9993378ca58..6fdc7dcf529 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -884,7 +884,7 @@ main(int argc, char **argv) } break; case 'R': - if (sscanf(optarg, "%u/%u/%u", + if (sscanf(optarg, "%u/%u/" UINT64_FORMAT, &config.filter_by_relation.spcOid, &config.filter_by_relation.dbOid, &config.filter_by_relation.relNumber) != 3 || diff --git a/src/bin/scripts/t/090_reindexdb.pl b/src/bin/scripts/t/090_reindexdb.pl index e706d686e39..de5cee6fa08 100644 --- a/src/bin/scripts/t/090_reindexdb.pl +++ b/src/bin/scripts/t/090_reindexdb.pl @@ -40,7 +40,7 @@ my $toast_index = $node->safe_psql('postgres', # REINDEX operations. A set of relfilenodes is saved from the catalogs # and then compared with pg_class. $node->safe_psql('postgres', - 'CREATE TABLE index_relfilenodes (parent regclass, indname text, indoid oid, relfilenode oid);' + 'CREATE TABLE index_relfilenodes (parent regclass, indname text, indoid oid, relfilenode int8);' ); # Save the relfilenode of a set of toast indexes, one from the catalog # pg_constraint and one from the test table. 
diff --git a/src/common/relpath.c b/src/common/relpath.c index 1b6b620ce83..d0d83e593b5 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -149,10 +149,10 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, Assert(dbOid == 0); Assert(backendId == InvalidBackendId); if (forkNumber != MAIN_FORKNUM) - path = psprintf("global/%u_%s", + path = psprintf("global/" UINT64_FORMAT "_%s", relNumber, forkNames[forkNumber]); else - path = psprintf("global/%u", relNumber); + path = psprintf("global/" UINT64_FORMAT, relNumber); } else if (spcOid == DEFAULTTABLESPACE_OID) { @@ -160,21 +160,21 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = psprintf("base/%u/%u_%s", + path = psprintf("base/%u/" UINT64_FORMAT "_%s", dbOid, relNumber, forkNames[forkNumber]); else - path = psprintf("base/%u/%u", + path = psprintf("base/%u/" UINT64_FORMAT, dbOid, relNumber); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("base/%u/t%d_%u_%s", + path = psprintf("base/%u/t%d_" UINT64_FORMAT "_%s", dbOid, backendId, relNumber, forkNames[forkNumber]); else - path = psprintf("base/%u/t%d_%u", + path = psprintf("base/%u/t%d_" UINT64_FORMAT, dbOid, backendId, relNumber); } } @@ -184,24 +184,24 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, if (backendId == InvalidBackendId) { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/" UINT64_FORMAT "_%s", spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, relNumber, forkNames[forkNumber]); else - path = psprintf("pg_tblspc/%u/%s/%u/%u", + path = psprintf("pg_tblspc/%u/%s/%u/" UINT64_FORMAT, spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, relNumber); } else { if (forkNumber != MAIN_FORKNUM) - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_" UINT64_FORMAT "_%s", spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, backendId, 
relNumber, forkNames[forkNumber]); else - path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u", + path = psprintf("pg_tblspc/%u/%s/%u/t%d_" UINT64_FORMAT, spcOid, TABLESPACE_VERSION_DIRECTORY, dbOid, backendId, relNumber); } diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c index abea88154ca..d4978527b41 100644 --- a/src/fe_utils/option_utils.c +++ b/src/fe_utils/option_utils.c @@ -13,6 +13,7 @@ #include "postgres_fe.h" #include "common/logging.h" +#include "common/relpath.h" #include "common/string.h" #include "fe_utils/option_utils.h" @@ -82,3 +83,42 @@ option_parse_int(const char *optarg, const char *optname, *result = val; return true; } + +/* + * option_parse_relfilenumber + * + * Parse relfilenumber value for an option. If the parsing is successful, + * returns true; if parsing fails, returns false. + */ +bool +option_parse_relfilenumber(const char *optarg, const char *optname) +{ + char *endptr; + uint64 val; + + errno = 0; + val = strtou64(optarg, &endptr, 10); + + /* + * Skip any trailing whitespace; if anything but whitespace remains before + * the terminating character, fail. + */ + while (*endptr != '\0' && isspace((unsigned char) *endptr)) + endptr++; + + if (*endptr != '\0') + { + pg_log_error("invalid value \"%s\" for option %s", + optarg, optname); + return false; + } + + if (val > MAX_RELFILENUMBER) + { + pg_log_error("%s must be in range " UINT64_FORMAT ".." 
UINT64_FORMAT, + optname, UINT64CONST(0), MAX_RELFILENUMBER); + return false; + } + + return true; +} diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 775471d2a7d..2aaad2b9d51 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -15,6 +15,7 @@ #define TRANSAM_H #include "access/xlogdefs.h" +#include "common/relpath.h" /* ---------------- @@ -196,6 +197,33 @@ FullTransactionIdAdvance(FullTransactionId *dest) #define FirstUnpinnedObjectId 12000 #define FirstNormalObjectId 16384 +/* ---------- + * RelFileNumbers are normally assigned sequentially beginning with + * FirstNormalRelFileNumber, but for system tables the initial RelFileNumber + * is equal to the table OID. This scheme allows pg_upgrade to work: we expect + * that the new cluster will contain only system tables, and that none of those + * will have previously been rewritten, so any RelFileNumber which is in use + * in both the old and new clusters will be used for the same relation in both + * places. + * + * This is important because pg_upgrade can't reactively move conflicting + * relations out of the way. If it tries to set the RelFileNumber for a + * relation to some value that's already in use by a different relation, the + * upgrade will just fail. It's OK if the same RelFileNumber is used for the + * same relation, though, since then nothing needs to be changed. + * ---------- + */ +#define FirstNormalRelFileNumber ((RelFileNumber) 100000) + +#define CHECK_RELFILENUMBER_RANGE(relfilenumber) \ +do { \ + if ((relfilenumber) < 0 || (relfilenumber) > MAX_RELFILENUMBER) \ + ereport(ERROR, \ + errcode(ERRCODE_INVALID_PARAMETER_VALUE), \ + errmsg("relfilenumber %llu is out of range", \ + (unsigned long long) (relfilenumber))); \ +} while (0) + /* * VariableCache is a data structure in shared memory that is used to track * OID and XID assignment state. 
For largely historical reasons, there is @@ -214,6 +242,15 @@ typedef struct VariableCacheData Oid nextOid; /* next OID to assign */ uint32 oidCount; /* OIDs available before must do XLOG work */ + /* + * These fields are protected by RelFileNumberGenLock. + */ + RelFileNumber nextRelFileNumber; /* next relfilenumber to assign */ + RelFileNumber loggedRelFileNumber; /* last logged relfilenumber */ + RelFileNumber flushedRelFileNumber; /* last flushed relfilenumber */ + XLogRecPtr loggedRelFileNumberRecPtr; /* xlog record pointer w.r.t. + * loggedRelFileNumber */ + /* * These fields are protected by XidGenLock. */ @@ -293,6 +330,9 @@ extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); +extern RelFileNumber GetNewRelFileNumber(Oid reltablespace, + char relpersistence); +extern void SetNextRelFileNumber(RelFileNumber relnumber); extern void StopGeneratingPinnedObjectIds(void); #ifdef USE_ASSERT_CHECKING diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index dce265098e3..53375865dfd 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -236,6 +236,7 @@ extern void CreateCheckPoint(int flags); extern bool CreateRestartPoint(int flags); extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN); extern void XLogPutNextOid(Oid nextOid); +extern XLogRecPtr LogNextRelFileNumber(RelFileNumber nextrelnumber); extern XLogRecPtr XLogRestorePoint(const char *rpName); extern void UpdateFullPageWrites(void); extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p); diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index e1c85f98550..b45253045e7 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -38,8 +38,5 @@ extern bool IsPinnedObject(Oid classId, Oid objectId); extern Oid 
GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn); -extern RelFileNumber GetNewRelFileNumber(Oid reltablespace, - Relation pg_class, - char relpersistence); #endif /* CATALOG_H */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 95e7c249ed8..8ba25e4dc8e 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202209261 +#define CATALOG_VERSION_NO 202209271 #endif diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index e1f4eefa220..4768e5ebda5 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -34,6 +34,13 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat /* oid */ Oid oid; + /* access method; 0 if not a table / index */ + Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am); + + /* identifier of physical storage file */ + /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */ + int64 relfilenode BKI_DEFAULT(0); + /* class name */ NameData relname; @@ -49,13 +56,6 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat /* class owner */ Oid relowner BKI_DEFAULT(POSTGRES) BKI_LOOKUP(pg_authid); - /* access method; 0 if not a table / index */ - Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am); - - /* identifier of physical storage file */ - /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */ - Oid relfilenode BKI_DEFAULT(0); - /* identifier of table space for relation (0 means default for database) */ Oid reltablespace BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_tablespace); @@ -154,7 +154,7 @@ typedef FormData_pg_class *Form_pg_class; DECLARE_UNIQUE_INDEX_PKEY(pg_class_oid_index, 2662, ClassOidIndexId, on pg_class using btree(oid oid_ops)); DECLARE_UNIQUE_INDEX(pg_class_relname_nsp_index, 2663, ClassNameNspIndexId, on pg_class using btree(relname name_ops, relnamespace 
oid_ops)); -DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode oid_ops)); +DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode int8_ops)); #ifdef EXPOSE_TO_CLIENT_CODE diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 06368e23667..096222f1fe5 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -41,6 +41,7 @@ typedef struct CheckPoint * timeline (equals ThisTimeLineID otherwise) */ bool fullPageWrites; /* current full_page_writes */ FullTransactionId nextXid; /* next free transaction ID */ + RelFileNumber nextRelFileNumber; /* next relfilenumber */ Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ @@ -78,6 +79,7 @@ typedef struct CheckPoint #define XLOG_FPI 0xB0 /* 0xC0 is used in Postgres 9.5-11 */ #define XLOG_OVERWRITE_CONTRECORD 0xD0 +#define XLOG_NEXT_RELFILENUMBER 0xE0 /* diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index a07e737a337..8b72f8a215b 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -7329,11 +7329,11 @@ proname => 'pg_indexes_size', provolatile => 'v', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_indexes_size' }, { oid => '2999', descr => 'filenode identifier of relation', - proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'oid', + proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'int8', proargtypes => 'regclass', prosrc => 'pg_relation_filenode' }, { oid => '3454', descr => 'relation OID for filenode and tablespace', proname => 'pg_filenode_relation', provolatile => 's', - prorettype => 'regclass', proargtypes => 'oid oid', + prorettype => 'regclass', proargtypes => 
'oid int8', prosrc => 'pg_filenode_relation' }, { oid => '3034', descr => 'file path of relation', proname => 'pg_relation_filepath', provolatile => 's', prorettype => 'text', @@ -11125,15 +11125,15 @@ prosrc => 'binary_upgrade_set_missing_value' }, { oid => '4545', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_heap_relfilenode', provolatile => 'v', - proparallel => 'u', prorettype => 'void', proargtypes => 'oid', + proparallel => 'u', prorettype => 'void', proargtypes => 'int8', prosrc => 'binary_upgrade_set_next_heap_relfilenode' }, { oid => '4546', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_index_relfilenode', provolatile => 'v', - proparallel => 'u', prorettype => 'void', proargtypes => 'oid', + proparallel => 'u', prorettype => 'void', proargtypes => 'int8', prosrc => 'binary_upgrade_set_next_index_relfilenode' }, { oid => '4547', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_toast_relfilenode', provolatile => 'v', - proparallel => 'u', prorettype => 'void', proargtypes => 'oid', + proparallel => 'u', prorettype => 'void', proargtypes => 'int8', prosrc => 'binary_upgrade_set_next_toast_relfilenode' }, { oid => '4548', descr => 'for use by pg_upgrade', proname => 'binary_upgrade_set_next_pg_tablespace_oid', provolatile => 'v', diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h index 4bbd94393c8..2d3b52fe0b8 100644 --- a/src/include/common/relpath.h +++ b/src/include/common/relpath.h @@ -22,10 +22,12 @@ /* * RelFileNumber data type identifies the specific relation file name. 
*/ -typedef Oid RelFileNumber; -#define InvalidRelFileNumber ((RelFileNumber) InvalidOid) +typedef uint64 RelFileNumber; +#define InvalidRelFileNumber ((RelFileNumber) 0) #define RelFileNumberIsValid(relnumber) \ ((bool) ((relnumber) != InvalidRelFileNumber)) +#define atorelnumber(x) ((RelFileNumber) strtou64((x), NULL, 10)) +#define MAX_RELFILENUMBER UINT64CONST(0x00FFFFFFFFFFFFFF) /* * Name of major-version-specific tablespace subdirectories @@ -35,6 +37,7 @@ typedef Oid RelFileNumber; /* Characters to allow for an OID in a relation path */ #define OIDCHARS 10 /* max chars printed by %u */ +#define RELNUMBERCHARS 20 /* max chars printed by UINT64_FORMAT */ /* * Stuff for fork names. diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h index 03c09fd13a4..2508a6193b0 100644 --- a/src/include/fe_utils/option_utils.h +++ b/src/include/fe_utils/option_utils.h @@ -22,5 +22,7 @@ extern void handle_help_version_opts(int argc, char *argv[], extern bool option_parse_int(const char *optarg, const char *optname, int min_range, int max_range, int *result); +extern bool option_parse_relfilenumber(const char *optarg, + const char *optname); #endif /* OPTION_UTILS_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 406db6be783..c3417b28ba9 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -92,29 +92,66 @@ typedef struct buftag { Oid spcOid; /* tablespace oid */ Oid dbOid; /* database oid */ - RelFileNumber relNumber; /* relation file number */ - ForkNumber forkNum; /* fork number */ + + /* + * relForkDetails[] stores the fork number in the high 8 bits of the first + * integer; the remaining 56 bits are used to store the relfilenumber. + * Expanding the relfilenumber to a full 64 bits would require widening + * the BufferTag, which is undesirable for performance reasons. 
We use + * two 32-bit values here rather than a single 64-bit value to avoid + * padding the struct out to a multiple of 8 bytes. + */ + uint32 relForkDetails[2]; BlockNumber blockNum; /* blknum relative to begin of reln */ } BufferTag; +/* High relNumber bits in relForkDetails[0] */ +#define BUFTAG_RELNUM_HIGH_BITS 24 + +/* Low relNumber bits in relForkDetails[1] */ +#define BUFTAG_RELNUM_LOW_BITS 32 + +/* Mask to fetch high bits of relNumber from relForkDetails[0] */ +#define BUFTAG_RELNUM_HIGH_MASK ((1U << BUFTAG_RELNUM_HIGH_BITS) - 1) + +/* Mask to fetch low bits of relNumber from relForkDetails[1] */ +#define BUFTAG_RELNUM_LOW_MASK 0XFFFFFFFF + static inline RelFileNumber BufTagGetRelNumber(const BufferTag *tag) { - return tag->relNumber; + uint64 relnum; + + relnum = ((uint64) tag->relForkDetails[0]) & BUFTAG_RELNUM_HIGH_MASK; + relnum = (relnum << BUFTAG_RELNUM_LOW_BITS) | tag->relForkDetails[1]; + + Assert(relnum <= MAX_RELFILENUMBER); + return (RelFileNumber) relnum; } static inline ForkNumber BufTagGetForkNum(const BufferTag *tag) { - return tag->forkNum; + ForkNumber ret; + + StaticAssertStmt(MAX_FORKNUM <= INT8_MAX, + "MAX_FORKNUM can't be greater than INT8_MAX"); + + ret = (int8) (tag->relForkDetails[0] >> BUFTAG_RELNUM_HIGH_BITS); + return ret; } static inline void BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber, ForkNumber forknum) { - tag->relNumber = relnumber; - tag->forkNum = forknum; + Assert(relnumber <= MAX_RELFILENUMBER); + Assert(forknum <= MAX_FORKNUM); + + tag->relForkDetails[0] = (relnumber >> BUFTAG_RELNUM_LOW_BITS) & + BUFTAG_RELNUM_HIGH_MASK; + tag->relForkDetails[0] |= (forknum << BUFTAG_RELNUM_HIGH_BITS); + tag->relForkDetails[1] = relnumber & BUFTAG_RELNUM_LOW_MASK; } static inline RelFileLocator @@ -153,9 +190,9 @@ BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2) { return (tag1->spcOid == tag2->spcOid) && (tag1->dbOid == tag2->dbOid) && - (tag1->relNumber == tag2->relNumber) && - (tag1->blockNum == 
tag2->blockNum) && - (tag1->forkNum == tag2->forkNum); + (tag1->relForkDetails[0] == tag2->relForkDetails[0]) && + (tag1->relForkDetails[1] == tag2->relForkDetails[1]) && + (tag1->blockNum == tag2->blockNum); } static inline bool diff --git a/src/include/storage/relfilelocator.h b/src/include/storage/relfilelocator.h index 10f41f3abb3..ef904644fa4 100644 --- a/src/include/storage/relfilelocator.h +++ b/src/include/storage/relfilelocator.h @@ -32,10 +32,11 @@ * Nonzero dbOid values correspond to pg_database.oid. * * relNumber identifies the specific relation. relNumber corresponds to - * pg_class.relfilenode (NOT pg_class.oid, because we need to be able - * to assign new physical files to relations in some situations). - * Notice that relNumber is only unique within a database in a particular - * tablespace. + * pg_class.relfilenode. Notice that relNumber values are assigned by + * GetNewRelFileNumber(), which will only ever assign the same value once + * during the lifetime of a cluster. However, since CREATE DATABASE duplicates + * the relfilenumbers of the template database, the values are in practice only + * unique within a database, not globally. * * Note: spcOid must be GLOBALTABLESPACE_OID if and only if dbOid is * zero. We support shared relations only in the "global" tablespace. 
@@ -75,6 +76,9 @@ typedef struct RelFileLocatorBackend BackendId backend; } RelFileLocatorBackend; +#define SizeOfRelFileLocatorBackend \ + (offsetof(RelFileLocatorBackend, backend) + sizeof(BackendId)) + #define RelFileLocatorBackendIsTemp(rlocator) \ ((rlocator).backend != InvalidBackendId) diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 346f594ad02..86666b83ae5 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -2164,9 +2164,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -2175,10 +2174,10 @@ select relname, relname | orig_oid | storage | desc ------------------------------+----------+---------+--------------- at_partitioned | t | none | - at_partitioned_0 | t | own | - at_partitioned_0_id_name_key | t | own | child 0 index - at_partitioned_1 | t | own | - at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_0 | t | orig | + at_partitioned_0_id_name_key | t | orig | child 0 index + at_partitioned_1 | t | orig | + at_partitioned_1_id_name_key | t | orig | child 1 index at_partitioned_id_name_key | t | none | parent index (6 rows) @@ -2198,9 +2197,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -2209,10 +2207,10 @@ select relname, relname | orig_oid | storage | desc ------------------------------+----------+---------+-------------- at_partitioned | t | none | - at_partitioned_0 | t | own | - at_partitioned_0_id_name_key | f | own | parent index - at_partitioned_1 | t | own | - 
at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_0 | t | orig | + at_partitioned_0_id_name_key | f | new | parent index + at_partitioned_1 | t | orig | + at_partitioned_1_id_name_key | f | new | parent index at_partitioned_id_name_key | f | none | parent index (6 rows) @@ -2560,7 +2558,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) RETURNS boolean LANGUAGE plpgsql AS $$ DECLARE - v_relfilenode oid; + v_relfilenode int8; BEGIN v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out index 91f25717b5a..0a35f333f63 100644 --- a/src/test/regress/expected/fast_default.out +++ b/src/test/regress/expected/fast_default.out @@ -3,8 +3,8 @@ -- SET search_path = fast_default; CREATE SCHEMA fast_default; -CREATE TABLE m(id OID); -INSERT INTO m VALUES (NULL::OID); +CREATE TABLE m(id BIGINT); +INSERT INTO m VALUES (NULL::BIGINT); CREATE FUNCTION set(tabname name) RETURNS VOID AS $$ BEGIN diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out index 215eb899be3..af57470f93e 100644 --- a/src/test/regress/expected/oidjoins.out +++ b/src/test/regress/expected/oidjoins.out @@ -74,11 +74,11 @@ NOTICE: checking pg_type {typcollation} => pg_collation {oid} NOTICE: checking pg_attribute {attrelid} => pg_class {oid} NOTICE: checking pg_attribute {atttypid} => pg_type {oid} NOTICE: checking pg_attribute {attcollation} => pg_collation {oid} +NOTICE: checking pg_class {relam} => pg_am {oid} NOTICE: checking pg_class {relnamespace} => pg_namespace {oid} NOTICE: checking pg_class {reltype} => pg_type {oid} NOTICE: checking pg_class {reloftype} => pg_type {oid} NOTICE: checking pg_class {relowner} => pg_authid {oid} -NOTICE: checking pg_class {relam} => pg_am {oid} NOTICE: checking pg_class {reltablespace} => pg_tablespace {oid} NOTICE: checking pg_class {reltoastrelid} => pg_class {oid} NOTICE: 
checking pg_class {relrewrite} => pg_class {oid} diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 9f773aeeb94..a67eb5f982a 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1478,9 +1478,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -1499,9 +1498,8 @@ select relname, c.oid = oldoid as orig_oid, case relfilenode when 0 then 'none' - when c.oid then 'own' when oldfilenode then 'orig' - else 'OTHER' + else 'new' end as storage, obj_description(c.oid, 'pg_class') as desc from pg_class c left join old_oids using (relname) @@ -1641,7 +1639,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) RETURNS boolean LANGUAGE plpgsql AS $$ DECLARE - v_relfilenode oid; + v_relfilenode int8; BEGIN v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; diff --git a/src/test/regress/sql/fast_default.sql b/src/test/regress/sql/fast_default.sql index 16a3b7ca51d..819ec40fdaf 100644 --- a/src/test/regress/sql/fast_default.sql +++ b/src/test/regress/sql/fast_default.sql @@ -4,8 +4,8 @@ SET search_path = fast_default; CREATE SCHEMA fast_default; -CREATE TABLE m(id OID); -INSERT INTO m VALUES (NULL::OID); +CREATE TABLE m(id BIGINT); +INSERT INTO m VALUES (NULL::BIGINT); CREATE FUNCTION set(tabname name) RETURNS VOID AS $$