diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index d74b3e853c6..4d88eba5e3a 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -6,8 +6,8 @@ OBJS = \
pg_buffercache_pages.o
EXTENSION = pg_buffercache
-DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
- pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql
+DATA = pg_buffercache--1.0--1.1.sql pg_buffercache--1.1--1.2.sql pg_buffercache--1.2.sql \
+ pg_buffercache--1.2--1.3.sql pg_buffercache--1.3--1.4.sql
PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
REGRESS = pg_buffercache
diff --git a/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql
new file mode 100644
index 00000000000..50956b195a8
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql
@@ -0,0 +1,30 @@
+/* contrib/pg_buffercache/pg_buffercache--1.3--1.4.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.4'" to load this file. \quit
+
+/* First we have to remove the old view and function from the extension */
+ALTER EXTENSION pg_buffercache DROP VIEW pg_buffercache;
+ALTER EXTENSION pg_buffercache DROP FUNCTION pg_buffercache_pages();
+
+/* Then we can drop them (the view first, then the function it selects from) */
+DROP VIEW pg_buffercache;
+DROP FUNCTION pg_buffercache_pages();
+
+/* Now redefine both: C symbol pg_buffercache_pages_v1_4, relfilenode column declared int8 */
+CREATE FUNCTION pg_buffercache_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pages_v1_4'
+LANGUAGE C PARALLEL SAFE;
+
+CREATE VIEW pg_buffercache AS
+ SELECT P.* FROM pg_buffercache_pages() AS P
+ (bufferid integer, relfilenode int8, reltablespace oid, reldatabase oid,
+ relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+ pinning_backends int4);
+
+-- Don't want these to be available to public; only pg_monitor is granted access.
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 8c060ae9abf..a82ae5f9bb5 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
# pg_buffercache extension
comment = 'examine the shared buffer cache'
-default_version = '1.3'
+default_version = '1.4'
module_pathname = '$libdir/pg_buffercache'
relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index c5754ea9fa5..a45f240499a 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -59,9 +59,10 @@ typedef struct
* relation node/tablespace/database/blocknum and dirty indicator.
*/
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_pages_v1_4);
-Datum
-pg_buffercache_pages(PG_FUNCTION_ARGS)
+static Datum
+pg_buffercache_pages_internal(PG_FUNCTION_ARGS, Oid rfn_typid)
{
FuncCallContext *funcctx;
Datum result;
@@ -103,7 +104,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
INT4OID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
- OIDOID, -1, 0);
+ rfn_typid, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
OIDOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
@@ -209,7 +210,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
}
else
{
- values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
+ if (rfn_typid == INT8OID)
+ values[1] =
+ Int64GetDatum((int64) fctx->record[i].relfilenumber);
+ else
+ {
+ Assert(rfn_typid == OIDOID);
+
+ if (fctx->record[i].relfilenumber > OID_MAX)
+ ereport(ERROR,
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("relfilenode %llu is too large to be represented as an OID",
+ (unsigned long long) fctx->record[i].relfilenumber),
+ errhint("Upgrade the extension using ALTER EXTENSION pg_buffercache UPDATE"));
+
+ values[1] =
+ ObjectIdGetDatum((Oid) fctx->record[i].relfilenumber);
+ }
+
nulls[1] = false;
values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
nulls[2] = false;
@@ -237,3 +255,16 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
else
SRF_RETURN_DONE(funcctx);
}
+
+/* entry point for old extension version: asks for relfilenode typed as OIDOID */
+Datum
+pg_buffercache_pages(PG_FUNCTION_ARGS)
+{
+ return pg_buffercache_pages_internal(fcinfo, OIDOID);
+}
+
+Datum
+pg_buffercache_pages_v1_4(PG_FUNCTION_ARGS)
+{
+ return pg_buffercache_pages_internal(fcinfo, INT8OID); /* 1.4 entry point: relfilenode typed as INT8OID */
+}
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c
index c8d673a20e3..31caf101a94 100644
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -345,7 +345,7 @@ apw_load_buffers(void)
{
unsigned forknum;
- if (fscanf(file, "%u,%u,%u,%u,%u\n", &blkinfo[i].database,
+ if (fscanf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n", &blkinfo[i].database,
&blkinfo[i].tablespace, &blkinfo[i].filenumber,
&forknum, &blkinfo[i].blocknum) != 5)
ereport(ERROR,
@@ -669,7 +669,7 @@ apw_dump_now(bool is_bgworker, bool dump_unlogged)
{
CHECK_FOR_INTERRUPTS();
- ret = fprintf(file, "%u,%u,%u,%u,%u\n",
+ ret = fprintf(file, "%u,%u," UINT64_FORMAT ",%u,%u\n",
block_info_array[i].database,
block_info_array[i].tablespace,
block_info_array[i].filenumber,
diff --git a/contrib/pg_walinspect/expected/pg_walinspect.out b/contrib/pg_walinspect/expected/pg_walinspect.out
index a1ee743457c..e9b06ed3af2 100644
--- a/contrib/pg_walinspect/expected/pg_walinspect.out
+++ b/contrib/pg_walinspect/expected/pg_walinspect.out
@@ -54,9 +54,9 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1');
-- ===================================================================
-- Test for filtering out WAL records of a particular table
-- ===================================================================
-SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset
+SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2')
- WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap';
+ WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap';
ok
----
t
diff --git a/contrib/pg_walinspect/sql/pg_walinspect.sql b/contrib/pg_walinspect/sql/pg_walinspect.sql
index 1b265ea7bcc..53938341257 100644
--- a/contrib/pg_walinspect/sql/pg_walinspect.sql
+++ b/contrib/pg_walinspect/sql/pg_walinspect.sql
@@ -39,10 +39,10 @@ SELECT COUNT(*) >= 0 AS ok FROM pg_get_wal_stats_till_end_of_wal(:'wal_lsn1');
-- Test for filtering out WAL records of a particular table
-- ===================================================================
-SELECT oid AS sample_tbl_oid FROM pg_class WHERE relname = 'sample_tbl' \gset
+SELECT relfilenode AS sample_tbl_relfilenode FROM pg_class WHERE relname = 'sample_tbl' \gset
SELECT COUNT(*) >= 1 AS ok FROM pg_get_wal_records_info(:'wal_lsn1', :'wal_lsn2')
- WHERE block_ref LIKE concat('%', :'sample_tbl_oid', '%') AND resource_manager = 'Heap';
+ WHERE block_ref LIKE concat('%', :'sample_tbl_relfilenode', '%') AND resource_manager = 'Heap';
-- ===================================================================
-- Test for filtering out WAL records based on resource_manager and
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 00f833d210e..40d4e9c35e6 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1984,7 +1984,7 @@ SCRAM-SHA-256$<iteration count>:&l
- relfilenodeoid
+ relfilenodeint8
Name of the on-disk file of this relation; zero means this
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 546213fa931..d8718ed61e6 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -25210,6 +25210,11 @@ SELECT collation for ('foo' COLLATE "de_DE");
timestamp with time zone
+
+ next_relfilenumber
+ bigint
+
+
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index a06fd3e26de..e2222655804 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -62,7 +62,7 @@
- relfilenodeoid
+ relfilenodeint8
(references pg_class.relfilenode)
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index e5b9f3f1ffa..d9e9b0f43ee 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -217,11 +217,12 @@ with the suffix _init (see ).
-Note that while a table's filenode often matches its OID, this is
-not necessarily the case; some operations, like
-TRUNCATE, REINDEX, CLUSTER and some forms
-of ALTER TABLE, can change the filenode while preserving the OID.
-Avoid assuming that filenode and table OID are the same.
+Note that a table's filenode will normally be different than the OID. For
+system tables, the initial filenode will be equal to the table OID, but it will
+be different if the table has ever been subjected to a rewriting operation,
+such as TRUNCATE, REINDEX,
+CLUSTER or some forms of ALTER TABLE.
+For user tables, even the initial filenode will be different than the table OID.
Also, for certain system catalogs including pg_class itself,
pg_class.relfilenode contains zero. The
actual filenode number of these catalogs is stored in a lower-level data
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index 41b92115bff..bc093f2a887 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -100,7 +100,7 @@ ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno, void *rda
BlockNumber blknum;
BufferGetTag(buffer, &locator, &forknum, &blknum);
- elog(ERROR, "failed to add item to index page in %u/%u/%u",
+ elog(ERROR, "failed to add item to index page in %u/%u/" UINT64_FORMAT,
locator.spcOid, locator.dbOid, locator.relNumber);
}
}
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index 7dd3c1d500f..d1c8a24d66f 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -26,7 +26,7 @@ out_gistxlogPageUpdate(StringInfo buf, gistxlogPageUpdate *xlrec)
static void
out_gistxlogPageReuse(StringInfo buf, gistxlogPageReuse *xlrec)
{
- appendStringInfo(buf, "rel %u/%u/%u; blk %u; latestRemovedXid %u:%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; blk %u; latestRemovedXid %u:%u",
xlrec->locator.spcOid, xlrec->locator.dbOid,
xlrec->locator.relNumber, xlrec->block,
EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index 923d3bc43df..70bd49303a9 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -169,7 +169,7 @@ heap2_desc(StringInfo buf, XLogReaderState *record)
{
xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec;
- appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; tid %u/%u",
xlrec->target_locator.spcOid,
xlrec->target_locator.dbOid,
xlrec->target_locator.relNumber,
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index 4843cd530df..6192a7ba841 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -100,7 +100,7 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
- appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT "; latestRemovedXid %u:%u",
xlrec->locator.spcOid, xlrec->locator.dbOid,
xlrec->locator.relNumber,
EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
diff --git a/src/backend/access/rmgrdesc/seqdesc.c b/src/backend/access/rmgrdesc/seqdesc.c
index b3845f93bff..df72caf1768 100644
--- a/src/backend/access/rmgrdesc/seqdesc.c
+++ b/src/backend/access/rmgrdesc/seqdesc.c
@@ -25,7 +25,7 @@ seq_desc(StringInfo buf, XLogReaderState *record)
xl_seq_rec *xlrec = (xl_seq_rec *) rec;
if (info == XLOG_SEQ_LOG)
- appendStringInfo(buf, "rel %u/%u/%u",
+ appendStringInfo(buf, "rel %u/%u/" UINT64_FORMAT,
xlrec->locator.spcOid, xlrec->locator.dbOid,
xlrec->locator.relNumber);
}
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 3fd7185f217..84a826bf49c 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -45,8 +45,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
CheckPoint *checkpoint = (CheckPoint *) rec;
appendStringInfo(buf, "redo %X/%X; "
- "tli %u; prev tli %u; fpw %s; xid %u:%u; oid %u; multi %u; offset %u; "
- "oldest xid %u in DB %u; oldest multi %u in DB %u; "
+ "tli %u; prev tli %u; fpw %s; xid %u:%u; relfilenumber " UINT64_FORMAT ";oid %u; "
+ "multi %u; offset %u; oldest xid %u in DB %u; oldest multi %u in DB %u; "
"oldest/newest commit timestamp xid: %u/%u; "
"oldest running xid %u; %s",
LSN_FORMAT_ARGS(checkpoint->redo),
@@ -55,6 +55,7 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
checkpoint->fullPageWrites ? "true" : "false",
EpochFromFullTransactionId(checkpoint->nextXid),
XidFromFullTransactionId(checkpoint->nextXid),
+ checkpoint->nextRelFileNumber,
checkpoint->nextOid,
checkpoint->nextMulti,
checkpoint->nextMultiOffset,
@@ -74,6 +75,13 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
memcpy(&nextOid, rec, sizeof(Oid));
appendStringInfo(buf, "%u", nextOid);
}
+ else if (info == XLOG_NEXT_RELFILENUMBER)
+ {
+ RelFileNumber nextRelFileNumber;
+
+ memcpy(&nextRelFileNumber, rec, sizeof(RelFileNumber));
+ appendStringInfo(buf, UINT64_FORMAT, nextRelFileNumber);
+ }
else if (info == XLOG_RESTORE_POINT)
{
xl_restore_point *xlrec = (xl_restore_point *) rec;
@@ -169,6 +177,9 @@ xlog_identify(uint8 info)
case XLOG_NEXTOID:
id = "NEXTOID";
break;
+ case XLOG_NEXT_RELFILENUMBER:
+ id = "NEXT_RELFILENUMBER";
+ break;
case XLOG_SWITCH:
id = "SWITCH";
break;
@@ -237,7 +248,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty,
appendStringInfoChar(buf, ' ');
appendStringInfo(buf,
- "blkref #%d: rel %u/%u/%u fork %s blk %u",
+ "blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
forkNames[forknum],
@@ -297,7 +308,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty,
if (forknum != MAIN_FORKNUM)
{
appendStringInfo(buf,
- ", blkref #%d: rel %u/%u/%u fork %s blk %u",
+ ", blkref #%d: rel %u/%u/" UINT64_FORMAT " fork %s blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
forkNames[forknum],
@@ -306,7 +317,7 @@ XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty,
else
{
appendStringInfo(buf,
- ", blkref #%d: rel %u/%u/%u blk %u",
+ ", blkref #%d: rel %u/%u/" UINT64_FORMAT " blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
blk);
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index 72af6560600..91c2578f7a9 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -692,8 +692,9 @@ by having database restart search for files that don't have any committed
entry in pg_class, but that currently isn't done because of the possibility
of deleting data that is useful for forensic analysis of the crash.
Orphan files are harmless --- at worst they waste a bit of disk space ---
-because we check for on-disk collisions when allocating new relfilenumber
-OIDs. So cleaning up isn't really necessary.
+because the relfilenumber counter is monotonically increasing. The maximum
+value is 2^56-1, and there is no provision for wraparound. Thus, on-disk
+collisions aren't possible.
3. Deleting a table, which requires an unlink() that could fail.
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 849a7ce9d6d..f99c697c2f5 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -13,12 +13,16 @@
#include "postgres.h"
+#include <unistd.h>
+
#include "access/clog.h"
#include "access/commit_ts.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/xact.h"
#include "access/xlogutils.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_tablespace.h"
#include "commands/dbcommands.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
@@ -30,6 +34,15 @@
/* Number of OIDs to prefetch (preallocate) per XLOG write */
#define VAR_OID_PREFETCH 8192
+/* Number of RelFileNumbers to be logged per XLOG write */
+#define VAR_RELNUMBER_PER_XLOG 512
+
+/*
+ * Log another batch once the number of logged-but-unused RelFileNumbers drops
+ * to this threshold or below.  Valid values are 0 to VAR_RELNUMBER_PER_XLOG - 1.
+ */
+#define VAR_RELNUMBER_NEW_XLOG_THRESHOLD 256
+
/* pointer to "variable cache" in shared memory (set up by shmem.c) */
VariableCache ShmemVariableCache = NULL;
@@ -521,8 +534,7 @@ ForceTransactionIdLimitUpdate(void)
* wide, counter wraparound will occur eventually, and therefore it is unwise
* to assume they are unique unless precautions are taken to make them so.
* Hence, this routine should generally not be used directly. The only direct
- * callers should be GetNewOidWithIndex() and GetNewRelFileNumber() in
- * catalog/catalog.c.
+ * caller should be GetNewOidWithIndex() in catalog/catalog.c.
*/
Oid
GetNewObjectId(void)
@@ -612,6 +624,199 @@ SetNextObjectId(Oid nextOid)
LWLockRelease(OidGenLock);
}
+/*
+ * GetNewRelFileNumber -- allocate the next value of the relfilenumber counter
+ *
+ * Errors out in recovery, in binary upgrade, or past MAX_RELFILENUMBER.  The
+ * arguments are used only by the assert-enabled on-disk collision check.
+ */
+RelFileNumber
+GetNewRelFileNumber(Oid reltablespace, char relpersistence)
+{
+ RelFileNumber result;
+ RelFileNumber nextRelFileNumber,
+ loggedRelFileNumber,
+ flushedRelFileNumber;
+
+ StaticAssertStmt(VAR_RELNUMBER_NEW_XLOG_THRESHOLD < VAR_RELNUMBER_PER_XLOG,
+ "VAR_RELNUMBER_NEW_XLOG_THRESHOLD must be smaller than VAR_RELNUMBER_PER_XLOG");
+
+ /* safety check, we should never get this far in a HS standby */
+ if (RecoveryInProgress())
+ elog(ERROR, "cannot assign RelFileNumber during recovery");
+
+ if (IsBinaryUpgrade)
+ elog(ERROR, "cannot assign RelFileNumber during binary upgrade");
+
+ LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+
+ nextRelFileNumber = ShmemVariableCache->nextRelFileNumber;
+ loggedRelFileNumber = ShmemVariableCache->loggedRelFileNumber;
+ flushedRelFileNumber = ShmemVariableCache->flushedRelFileNumber;
+
+ Assert(nextRelFileNumber <= flushedRelFileNumber);
+ Assert(flushedRelFileNumber <= loggedRelFileNumber);
+
+ /* the counter is not allowed to wrap; fail once it exceeds MAX_RELFILENUMBER */
+ if (unlikely(nextRelFileNumber > MAX_RELFILENUMBER))
+ elog(ERROR, "relfilenumber is too large");
+
+ /*
+ * If the number of logged-but-unconsumed relfilenumbers has dropped to
+ * the threshold or below, log another batch now.  We could instead wait
+ * until every logged relfilenumber had been consumed, but then the new
+ * WAL record would have to be flushed immediately: nextRelFileNumber
+ * must always be larger than any relfilenumber already in use on disk,
+ * and to maintain that invariant the record we log must reach disk
+ * before any new file is created with a number from the newly logged
+ * range.
+ *
+ * Logging early avoids that immediate flush: the newly logged record
+ * only has to be flushed before we start consuming relfilenumbers from
+ * the new range (see the flush below, which runs once the previously
+ * flushed range is used up).  By then, hopefully, the record has already
+ * been flushed by some other XLogFlush operation.
+ */
+ if (loggedRelFileNumber - nextRelFileNumber <=
+ VAR_RELNUMBER_NEW_XLOG_THRESHOLD)
+ {
+ XLogRecPtr recptr;
+
+ loggedRelFileNumber = loggedRelFileNumber + VAR_RELNUMBER_PER_XLOG;
+ recptr = LogNextRelFileNumber(loggedRelFileNumber);
+ ShmemVariableCache->loggedRelFileNumber = loggedRelFileNumber;
+
+ /* remember for the future flush */
+ ShmemVariableCache->loggedRelFileNumberRecPtr = recptr;
+ }
+
+ /*
+ * If nextRelFileNumber has reached the flushed limit, flush the WAL up to
+ * the most recently logged relfilenumber record before handing it out.
+ */
+ if (nextRelFileNumber >= flushedRelFileNumber)
+ {
+ XLogFlush(ShmemVariableCache->loggedRelFileNumberRecPtr);
+ ShmemVariableCache->flushedRelFileNumber = loggedRelFileNumber;
+ }
+
+ result = ShmemVariableCache->nextRelFileNumber;
+
+ /* we should never be using any relfilenumber outside the flushed range */
+ Assert(result <= ShmemVariableCache->flushedRelFileNumber);
+
+ (ShmemVariableCache->nextRelFileNumber)++;
+
+ LWLockRelease(RelFileNumberGenLock);
+
+ /*
+ * Because the RelFileNumber counter only ever increases and never wraps
+ * around, it should be impossible for the newly-allocated RelFileNumber to
+ * already be in use. But, if Asserts are enabled, double check that
+ * there's no main-fork relation file with the new RelFileNumber already on
+ * disk.
+ */
+#ifdef USE_ASSERT_CHECKING
+ {
+ RelFileLocatorBackend rlocator;
+ char *rpath;
+ BackendId backend;
+
+ switch (relpersistence)
+ {
+ case RELPERSISTENCE_TEMP:
+ backend = BackendIdForTempRelations();
+ break;
+ case RELPERSISTENCE_UNLOGGED:
+ case RELPERSISTENCE_PERMANENT:
+ backend = InvalidBackendId;
+ break;
+ default:
+ elog(ERROR, "invalid relpersistence: %c", relpersistence);
+ }
+
+ /* this logic should match RelationInitPhysicalAddr */
+ rlocator.locator.spcOid =
+ reltablespace ? reltablespace : MyDatabaseTableSpace;
+ rlocator.locator.dbOid = (reltablespace == GLOBALTABLESPACE_OID) ?
+ InvalidOid : MyDatabaseId;
+ rlocator.locator.relNumber = result;
+
+ /*
+ * The relpath will vary based on the backend ID, so we must
+ * initialize that properly here to make sure that any collisions
+ * based on filename are properly detected.
+ */
+ rlocator.backend = backend;
+
+ /* check for existing file of same name. */
+ rpath = relpath(rlocator, MAIN_FORKNUM);
+ Assert(access(rpath, F_OK) != 0);
+ }
+#endif
+
+ return result;
+}
+
+/*
+ * SetNextRelFileNumber
+ *
+ * This may only be called during binary upgrade (an error is raised
+ * otherwise, or in recovery); it advances the RelFileNumber counter to the
+ * specified value, and does nothing if the counter is already that high.
+ */
+void
+SetNextRelFileNumber(RelFileNumber relnumber)
+{
+ /* safety check, we should never get this far in a HS standby */
+ if (RecoveryInProgress())
+ elog(ERROR, "cannot set RelFileNumber during recovery");
+
+ if (!IsBinaryUpgrade)
+ elog(ERROR, "RelFileNumber can be set only during binary upgrade");
+
+ LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+
+ /*
+ * If the counter is already at or beyond the requested value, there is
+ * nothing to do; the counter is never moved backwards here.  This can
+ * happen because during upgrade the objects are not created in
+ * relfilenumber order.
+ */
+ if (relnumber <= ShmemVariableCache->nextRelFileNumber)
+ {
+ LWLockRelease(RelFileNumberGenLock);
+ return;
+ }
+
+ /*
+ * If the new relfilenumber to be set is greater than or equal to already
+ * flushed relfilenumber then log more and flush immediately.
+ *
+ * (This is less efficient than GetNewRelFileNumber, which arranges to
+ * log some new relfilenumbers before the old batch is exhausted in the
+ * hope that a flush will happen in the background before any values are
+ * needed from the new batch. However, since this is only used during
+ * binary upgrade, it shouldn't really matter.)
+ */
+ if (relnumber >= ShmemVariableCache->flushedRelFileNumber)
+ {
+ RelFileNumber newlogrelnum;
+
+ newlogrelnum = relnumber + VAR_RELNUMBER_PER_XLOG;
+ XLogFlush(LogNextRelFileNumber(newlogrelnum));
+
+ /* we have flushed whatever we have logged so no pending flush */
+ ShmemVariableCache->loggedRelFileNumber = newlogrelnum;
+ ShmemVariableCache->flushedRelFileNumber = newlogrelnum;
+ ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr;
+ }
+
+ ShmemVariableCache->nextRelFileNumber = relnumber;
+
+ LWLockRelease(RelFileNumberGenLock);
+}
+
/*
* StopGeneratingPinnedObjectIds
*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1dd6df0fe15..dff9b8d2366 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4712,6 +4712,7 @@ BootStrapXLOG(void)
checkPoint.nextXid =
FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
checkPoint.nextOid = FirstGenbkiObjectId;
+ checkPoint.nextRelFileNumber = FirstNormalRelFileNumber;
checkPoint.nextMulti = FirstMultiXactId;
checkPoint.nextMultiOffset = 0;
checkPoint.oldestXid = FirstNormalTransactionId;
@@ -4725,7 +4726,11 @@ BootStrapXLOG(void)
ShmemVariableCache->nextXid = checkPoint.nextXid;
ShmemVariableCache->nextOid = checkPoint.nextOid;
+ ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber;
ShmemVariableCache->oidCount = 0;
+ ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->loggedRelFileNumberRecPtr = InvalidXLogRecPtr;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
@@ -5191,7 +5196,10 @@ StartupXLOG(void)
/* initialize shared memory variables from the checkpoint record */
ShmemVariableCache->nextXid = checkPoint.nextXid;
ShmemVariableCache->nextOid = checkPoint.nextOid;
+ ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber;
ShmemVariableCache->oidCount = 0;
+ ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
AdvanceOldestClogXid(checkPoint.oldestXid);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
@@ -6663,6 +6671,24 @@ CreateCheckPoint(int flags)
checkPoint.nextOid += ShmemVariableCache->oidCount;
LWLockRelease(OidGenLock);
+ /*
+ * If this is a shutdown checkpoint then we can safely start allocating
+ * relfilenumber from the nextRelFileNumber value after the restart because
+ * no one else can use the relfilenumber beyond that number before the
+ * shutdown. OTOH, if it is a normal checkpoint then if there is a crash
+ * after this point then we might end up reusing the same relfilenumbers
+ * after the restart so we need to set the nextRelFileNumber to the already
+ * logged relfilenumber as no one will use number beyond this limit without
+ * logging again.
+ */
+ LWLockAcquire(RelFileNumberGenLock, LW_SHARED);
+ if (shutdown)
+ checkPoint.nextRelFileNumber = ShmemVariableCache->nextRelFileNumber;
+ else
+ checkPoint.nextRelFileNumber = ShmemVariableCache->loggedRelFileNumber;
+
+ LWLockRelease(RelFileNumberGenLock);
+
MultiXactGetCheckptMulti(shutdown,
&checkPoint.nextMulti,
&checkPoint.nextMultiOffset,
@@ -7540,6 +7566,24 @@ XLogPutNextOid(Oid nextOid)
*/
}
+/*
+ * Similar to XLogPutNextOid, but writes a NEXT_RELFILENUMBER WAL record
+ * instead of a NEXTOID record.  The record is inserted but not flushed here;
+ * the XLogRecPtr returned by XLogInsert is passed back so that the caller
+ * can flush it at the appropriate time.
+ */
+XLogRecPtr
+LogNextRelFileNumber(RelFileNumber nextrelnumber)
+{
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&nextrelnumber), sizeof(RelFileNumber));
+ recptr = XLogInsert(RM_XLOG_ID, XLOG_NEXT_RELFILENUMBER);
+
+ return recptr;
+}
+
/*
* Write an XLOG SWITCH record.
*
@@ -7755,6 +7799,17 @@ xlog_redo(XLogReaderState *record)
ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
}
+ if (info == XLOG_NEXT_RELFILENUMBER)
+ {
+ RelFileNumber nextRelFileNumber;
+
+ memcpy(&nextRelFileNumber, XLogRecGetData(record), sizeof(RelFileNumber));
+ LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextRelFileNumber = nextRelFileNumber;
+ ShmemVariableCache->loggedRelFileNumber = nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = nextRelFileNumber;
+ LWLockRelease(RelFileNumberGenLock);
+ }
else if (info == XLOG_CHECKPOINT_SHUTDOWN)
{
CheckPoint checkPoint;
@@ -7769,6 +7824,11 @@ xlog_redo(XLogReaderState *record)
ShmemVariableCache->nextOid = checkPoint.nextOid;
ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
+ LWLockAcquire(RelFileNumberGenLock, LW_EXCLUSIVE);
+ ShmemVariableCache->nextRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->loggedRelFileNumber = checkPoint.nextRelFileNumber;
+ ShmemVariableCache->flushedRelFileNumber = checkPoint.nextRelFileNumber;
+ LWLockRelease(RelFileNumberGenLock);
MultiXactSetNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
index 8f5d4253320..cea38eccea6 100644
--- a/src/backend/access/transam/xlogprefetcher.c
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -613,7 +613,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation",
+ "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " until %X/%X is replayed, which creates the relation",
xlrec->rlocator.spcOid,
xlrec->rlocator.dbOid,
xlrec->rlocator.relNumber,
@@ -636,7 +636,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation",
+ "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " from block %u until %X/%X is replayed, which truncates the relation",
xlrec->rlocator.spcOid,
xlrec->rlocator.dbOid,
xlrec->rlocator.relNumber,
@@ -735,7 +735,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk",
+ "suppressing all prefetch in relation %u/%u/" UINT64_FORMAT " until %X/%X is replayed, because the relation does not exist on disk",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
@@ -756,7 +756,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small",
+ "suppressing prefetch in relation %u/%u/" UINT64_FORMAT " from block %u until %X/%X is replayed, because the relation is too small",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
@@ -795,7 +795,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
* truncated beneath our feet?
*/
elog(ERROR,
- "could not prefetch relation %u/%u/%u block %u",
+ "could not prefetch relation %u/%u/" UINT64_FORMAT " block %u",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber,
@@ -934,7 +934,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator,
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
+ "prefetch of %u/%u/" UINT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
LSN_FORMAT_ARGS(filter->filter_until_replayed),
filter->filter_from_block);
@@ -950,7 +950,7 @@ XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher, RelFileLocator rlocator,
{
#ifdef XLOGPREFETCHER_DEBUG_LEVEL
elog(XLOGPREFETCHER_DEBUG_LEVEL,
- "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
+ "prefetch of %u/%u/" UINT64_FORMAT " block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, blockno,
LSN_FORMAT_ARGS(filter->filter_until_replayed));
#endif
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index b41e6826643..1026ce5dcf7 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -2228,14 +2228,14 @@ xlog_block_info(StringInfo buf, XLogReaderState *record)
continue;
if (forknum != MAIN_FORKNUM)
- appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/" UINT64_FORMAT ", fork %u, blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid,
rlocator.relNumber,
forknum,
blk);
else
- appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
+ appendStringInfo(buf, "; blkref #%d: rel %u/%u/" UINT64_FORMAT ", blk %u",
block_id,
rlocator.spcOid, rlocator.dbOid,
rlocator.relNumber,
@@ -2433,7 +2433,7 @@ verifyBackupPageConsistency(XLogReaderState *record)
if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
{
elog(FATAL,
- "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+ "inconsistent page found, rel %u/%u/" UINT64_FORMAT ", forknum %u, blkno %u",
rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
forknum, blkno);
}
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 563cba258dd..ffda2c210b7 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -619,17 +619,17 @@ CreateFakeRelcacheEntry(RelFileLocator rlocator)
rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
/* We don't know the name of the relation; use relfilenumber instead */
- sprintf(RelationGetRelationName(rel), "%u", rlocator.relNumber);
+ sprintf(RelationGetRelationName(rel), UINT64_FORMAT, rlocator.relNumber);
/*
* We set up the lockRelId in case anything tries to lock the dummy
- * relation. Note that this is fairly bogus since relNumber may be
+ * relation. Note that this is fairly bogus since relNumber is completely
* different from the relation's OID. It shouldn't really matter though.
* In recovery, we are running by ourselves and can't have any lock
* conflicts. While syncing, we already hold AccessExclusiveLock.
*/
rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid;
- rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber;
+ rel->rd_lockInfo.lockRelId.relId = (Oid) rlocator.relNumber;
rel->rd_smgr = NULL;
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 411cac9be3f..1434bcdd85c 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -1246,7 +1246,7 @@ sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
if (relForkNum != INIT_FORKNUM)
{
char initForkFile[MAXPGPATH];
- char relNumber[OIDCHARS + 1];
+ char relNumber[RELNUMBERCHARS + 1];
/*
* If any other type of fork, check if there is an init fork
diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c
index 2abd6b007a2..a9bd8ae008e 100644
--- a/src/backend/catalog/catalog.c
+++ b/src/backend/catalog/catalog.c
@@ -482,101 +482,6 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn)
return newOid;
}
-/*
- * GetNewRelFileNumber
- * Generate a new relfilenumber that is unique within the
- * database of the given tablespace.
- *
- * If the relfilenumber will also be used as the relation's OID, pass the
- * opened pg_class catalog, and this routine will guarantee that the result
- * is also an unused OID within pg_class. If the result is to be used only
- * as a relfilenumber for an existing relation, pass NULL for pg_class.
- *
- * As with GetNewOidWithIndex(), there is some theoretical risk of a race
- * condition, but it doesn't seem worth worrying about.
- *
- * Note: we don't support using this in bootstrap mode. All relations
- * created by bootstrap have preassigned OIDs, so there's no need.
- */
-RelFileNumber
-GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence)
-{
- RelFileLocatorBackend rlocator;
- char *rpath;
- bool collides;
- BackendId backend;
-
- /*
- * If we ever get here during pg_upgrade, there's something wrong; all
- * relfilenumber assignments during a binary-upgrade run should be
- * determined by commands in the dump script.
- */
- Assert(!IsBinaryUpgrade);
-
- switch (relpersistence)
- {
- case RELPERSISTENCE_TEMP:
- backend = BackendIdForTempRelations();
- break;
- case RELPERSISTENCE_UNLOGGED:
- case RELPERSISTENCE_PERMANENT:
- backend = InvalidBackendId;
- break;
- default:
- elog(ERROR, "invalid relpersistence: %c", relpersistence);
- return InvalidRelFileNumber; /* placate compiler */
- }
-
- /* This logic should match RelationInitPhysicalAddr */
- rlocator.locator.spcOid = reltablespace ? reltablespace : MyDatabaseTableSpace;
- rlocator.locator.dbOid =
- (rlocator.locator.spcOid == GLOBALTABLESPACE_OID) ?
- InvalidOid : MyDatabaseId;
-
- /*
- * The relpath will vary based on the backend ID, so we must initialize
- * that properly here to make sure that any collisions based on filename
- * are properly detected.
- */
- rlocator.backend = backend;
-
- do
- {
- CHECK_FOR_INTERRUPTS();
-
- /* Generate the OID */
- if (pg_class)
- rlocator.locator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId,
- Anum_pg_class_oid);
- else
- rlocator.locator.relNumber = GetNewObjectId();
-
- /* Check for existing file of same name */
- rpath = relpath(rlocator, MAIN_FORKNUM);
-
- if (access(rpath, F_OK) == 0)
- {
- /* definite collision */
- collides = true;
- }
- else
- {
- /*
- * Here we have a little bit of a dilemma: if errno is something
- * other than ENOENT, should we declare a collision and loop? In
- * practice it seems best to go ahead regardless of the errno. If
- * there is a colliding file we will get an smgr failure when we
- * attempt to create the new relation file.
- */
- collides = false;
- }
-
- pfree(rpath);
- } while (collides);
-
- return rlocator.locator.relNumber;
-}
-
/*
* SQL callable interface for GetNewOidWithIndex(). Outside of initdb's
* direct insertions into catalog tables, and recovering from corruption, this
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 9a80ccdccdf..de01da198e3 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -341,11 +341,19 @@ heap_create(const char *relname,
else
{
/*
- * If relfilenumber is unspecified by the caller then create storage
- * with oid same as relid.
+ * If relfilenumber is unspecified by the caller then allocate a new
+ * one, except for system tables, for which we make the initial
+ * relfilenumber the same as the table OID. See the comments for
+ * FirstNormalRelFileNumber for an explanation of why we do this.
*/
if (!RelFileNumberIsValid(relfilenumber))
- relfilenumber = relid;
+ {
+ if (relid < FirstNormalObjectId)
+ relfilenumber = relid;
+ else
+ relfilenumber = GetNewRelFileNumber(reltablespace,
+ relpersistence);
+ }
}
/*
@@ -901,7 +909,7 @@ InsertPgClassTuple(Relation pg_class_desc,
values[Anum_pg_class_reloftype - 1] = ObjectIdGetDatum(rd_rel->reloftype);
values[Anum_pg_class_relowner - 1] = ObjectIdGetDatum(rd_rel->relowner);
values[Anum_pg_class_relam - 1] = ObjectIdGetDatum(rd_rel->relam);
- values[Anum_pg_class_relfilenode - 1] = ObjectIdGetDatum(rd_rel->relfilenode);
+ values[Anum_pg_class_relfilenode - 1] = Int64GetDatum(rd_rel->relfilenode);
values[Anum_pg_class_reltablespace - 1] = ObjectIdGetDatum(rd_rel->reltablespace);
values[Anum_pg_class_relpages - 1] = Int32GetDatum(rd_rel->relpages);
values[Anum_pg_class_reltuples - 1] = Float4GetDatum(rd_rel->reltuples);
@@ -1173,12 +1181,7 @@ heap_create_with_catalog(const char *relname,
if (shared_relation && reltablespace != GLOBALTABLESPACE_OID)
elog(ERROR, "shared relations must be placed in pg_global tablespace");
- /*
- * Allocate an OID for the relation, unless we were told what to use.
- *
- * The OID will be the relfilenumber as well, so make sure it doesn't
- * collide with either pg_class OIDs or existing physical files.
- */
+ /* Allocate an OID for the relation, unless we were told what to use. */
if (!OidIsValid(relid))
{
/* Use binary-upgrade override for pg_class.oid and relfilenumber */
@@ -1232,8 +1235,8 @@ heap_create_with_catalog(const char *relname,
}
if (!OidIsValid(relid))
- relid = GetNewRelFileNumber(reltablespace, pg_class_desc,
- relpersistence);
+ relid = GetNewOidWithIndex(pg_class_desc, ClassOidIndexId,
+ Anum_pg_class_oid);
}
/*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 61f1d3926a9..1fd40c42a3a 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -898,12 +898,7 @@ index_create(Relation heapRelation,
collationObjectId,
classObjectId);
- /*
- * Allocate an OID for the index, unless we were told what to use.
- *
- * The OID will be the relfilenumber as well, so make sure it doesn't
- * collide with either pg_class OIDs or existing physical files.
- */
+ /* Allocate an OID for the index, unless we were told what to use. */
if (!OidIsValid(indexRelationId))
{
/* Use binary-upgrade override for pg_class.oid and relfilenumber */
@@ -935,8 +930,8 @@ index_create(Relation heapRelation,
}
else
{
- indexRelationId =
- GetNewRelFileNumber(tableSpaceId, pg_class, relpersistence);
+ indexRelationId = GetNewOidWithIndex(pg_class, ClassOidIndexId,
+ Anum_pg_class_oid);
}
}
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index d708af19ed2..021e08580fb 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -968,6 +968,10 @@ smgr_redo(XLogReaderState *record)
xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
SMgrRelation reln;
+ if (xlrec->rlocator.relNumber > ShmemVariableCache->nextRelFileNumber)
+ elog(ERROR, "unexpected relnumber " UINT64_FORMAT " that is bigger than nextRelFileNumber " UINT64_FORMAT,
+ xlrec->rlocator.relNumber, ShmemVariableCache->nextRelFileNumber);
+
reln = smgropen(xlrec->rlocator, InvalidBackendId);
smgrcreate(reln, xlrec->forkNum, true);
}
@@ -981,6 +985,10 @@ smgr_redo(XLogReaderState *record)
int nforks = 0;
bool need_fsm_vacuum = false;
+ if (xlrec->rlocator.relNumber > ShmemVariableCache->nextRelFileNumber)
+ elog(ERROR, "unexpected relnumber " UINT64_FORMAT " that is bigger than nextRelFileNumber " UINT64_FORMAT,
+ xlrec->rlocator.relNumber, ShmemVariableCache->nextRelFileNumber);
+
reln = smgropen(xlrec->rlocator, InvalidBackendId);
/*
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 7d8a75d23c2..1b8e6d57294 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -14375,10 +14375,14 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
}
/*
- * Relfilenumbers are not unique in databases across tablespaces, so we
- * need to allocate a new one in the new tablespace.
- */
- newrelfilenumber = GetNewRelFileNumber(newTableSpace, NULL,
+ * Generate a new relfilenumber. We cannot reuse the old relfilenumber
+ * because of the possibility that that relation will be moved back to the
+ * original tablespace before the next checkpoint. At that point, the
+ * first segment of the main fork won't have been unlinked yet, and an
+ * attempt to create new relation storage with that same relfilenumber
+ * will fail.
+ */
+ newrelfilenumber = GetNewRelFileNumber(newTableSpace,
rel->rd_rel->relpersistence);
/* Open old and new relation */
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index b69ff37dbbd..cdd7986dfc3 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -267,7 +267,7 @@ CreateTableSpace(CreateTableSpaceStmt *stmt)
* parts.
*/
if (strlen(location) + 1 + strlen(TABLESPACE_VERSION_DIRECTORY) + 1 +
- OIDCHARS + 1 + OIDCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH)
+ OIDCHARS + 1 + RELNUMBERCHARS + 1 + FORKNAMECHARS + 1 + OIDCHARS > MAXPGPATH)
ereport(ERROR,
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("tablespace location \"%s\" is too long",
diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl
index 81b8c184a90..f1fa8945135 100644
--- a/src/backend/nodes/gen_node_support.pl
+++ b/src/backend/nodes/gen_node_support.pl
@@ -961,12 +961,12 @@ _read${n}(void)
print $off "\tWRITE_UINT_FIELD($f);\n";
print $rff "\tREAD_UINT_FIELD($f);\n" unless $no_read;
}
- elsif ($t eq 'uint64')
+ elsif ($t eq 'uint64' || $t eq 'RelFileNumber')
{
print $off "\tWRITE_UINT64_FIELD($f);\n";
print $rff "\tREAD_UINT64_FIELD($f);\n" unless $no_read;
}
- elsif ($t eq 'Oid' || $t eq 'RelFileNumber')
+ elsif ($t eq 'Oid')
{
print $off "\tWRITE_OID_FIELD($f);\n";
print $rff "\tREAD_OID_FIELD($f);\n" unless $no_read;
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 2cc0ac9eb09..cdf19a9c204 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -154,6 +154,7 @@ xlog_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
break;
case XLOG_NOOP:
case XLOG_NEXTOID:
+ case XLOG_NEXT_RELFILENUMBER:
case XLOG_SWITCH:
case XLOG_BACKUP_END:
case XLOG_PARAMETER_CHANGE:
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index 03d9c9c86a2..a0f398b458a 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -4932,7 +4932,7 @@ DisplayMapping(HTAB *tuplecid_data)
hash_seq_init(&hstat, tuplecid_data);
while ((ent = (ReorderBufferTupleCidEnt *) hash_seq_search(&hstat)) != NULL)
{
- elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
+ elog(DEBUG3, "mapping: node: %u/%u/" UINT64_FORMAT " tid: %u/%u cmin: %u, cmax: %u",
ent->key.rlocator.dbOid,
ent->key.rlocator.spcOid,
ent->key.rlocator.relNumber,
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
index 647c458b52e..c3faa68126a 100644
--- a/src/backend/storage/file/reinit.c
+++ b/src/backend/storage/file/reinit.c
@@ -31,7 +31,7 @@ static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
typedef struct
{
- Oid reloid; /* hash key */
+ RelFileNumber relnumber; /* hash key */
} unlogged_relation_entry;
/*
@@ -184,10 +184,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
* need to be reset. Otherwise, this cleanup operation would be
* O(n^2).
*/
- ctl.keysize = sizeof(Oid);
+ ctl.keysize = sizeof(RelFileNumber);
ctl.entrysize = sizeof(unlogged_relation_entry);
ctl.hcxt = CurrentMemoryContext;
- hash = hash_create("unlogged relation OIDs", 32, &ctl,
+ hash = hash_create("unlogged relation RelFileNumbers", 32, &ctl,
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
/* Scan the directory. */
@@ -208,10 +208,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
continue;
/*
- * Put the OID portion of the name into the hash table, if it
- * isn't already.
+ * Put the RELFILENUMBER portion of the name into the hash table,
+ * if it isn't already.
*/
- ent.reloid = atooid(de->d_name);
+ ent.relnumber = atorelnumber(de->d_name);
(void) hash_search(hash, &ent, HASH_ENTER, NULL);
}
@@ -248,10 +248,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
continue;
/*
- * See whether the OID portion of the name shows up in the hash
- * table. If so, nuke it!
+ * See whether the RELFILENUMBER portion of the name shows up in
+ * the hash table. If so, nuke it!
*/
- ent.reloid = atooid(de->d_name);
+ ent.relnumber = atorelnumber(de->d_name);
if (hash_search(hash, &ent, HASH_FIND, NULL))
{
snprintf(rm_path, sizeof(rm_path), "%s/%s",
@@ -286,7 +286,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
{
ForkNumber forkNum;
int relnumchars;
- char relnumbuf[OIDCHARS + 1];
+ char relnumbuf[RELNUMBERCHARS + 1];
char srcpath[MAXPGPATH * 2];
char dstpath[MAXPGPATH];
@@ -329,7 +329,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
{
ForkNumber forkNum;
int relnumchars;
- char relnumbuf[OIDCHARS + 1];
+ char relnumbuf[RELNUMBERCHARS + 1];
char mainpath[MAXPGPATH];
/* Skip anything that doesn't look like a relation data file. */
@@ -372,8 +372,8 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
* for a non-temporary relation and false otherwise.
*
* NB: If this function returns true, the caller is entitled to assume that
- * *relnumchars has been set to a value no more than OIDCHARS, and thus
- * that a buffer of OIDCHARS+1 characters is sufficient to hold the
+ * *relnumchars has been set to a value no more than RELNUMBERCHARS, and thus
+ * that a buffer of RELNUMBERCHARS+1 characters is sufficient to hold the
* RelFileNumber portion of the filename. This is critical to protect against
* a possible buffer overrun.
*/
@@ -386,7 +386,7 @@ parse_filename_for_nontemp_relation(const char *name, int *relnumchars,
/* Look for a non-empty string of digits (that isn't too long). */
for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
;
- if (pos == 0 || pos > OIDCHARS)
+ if (pos == 0 || pos > RELNUMBERCHARS)
return false;
*relnumchars = pos;
diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c
index af4dab7d2c7..1210be7470b 100644
--- a/src/backend/storage/freespace/fsmpage.c
+++ b/src/backend/storage/freespace/fsmpage.c
@@ -273,7 +273,7 @@ restart:
BlockNumber blknum;
BufferGetTag(buf, &rlocator, &forknum, &blknum);
- elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
+ elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/" UINT64_FORMAT,
blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber);
/* make sure we hold an exclusive lock */
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c2956..3c5d0410795 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,4 @@ XactTruncationLock 44
# 45 was XactTruncationLock until removal of BackendRandomLock
WrapLimitsVacuumLock 46
NotifyQueueTailLock 47
+RelFileNumberGenLock 48
\ No newline at end of file
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index a515bb36ac1..bed47f07d73 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -257,6 +257,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
* next checkpoint, we prevent reassignment of the relfilenumber until it's
* safe, because relfilenumber assignment skips over any existing file.
*
+ * XXX. Although all of this was true when relfilenumbers were 32 bits wide,
+ * they are now 56 bits wide and do not wrap around, so in the future we can
+ * change the code to immediately unlink the first segment of the relation
+ * along with all the others. We still do reuse relfilenumbers when createdb()
+ * is performed using the file-copy method or during movedb(), but the scenario
+ * described above can only happen when creating a new relation.
+ *
* We do not need to go through this dance for temp relations, though, because
* we never make WAL entries for temp rels, and so a temp rel poses no threat
* to the health of a regular rel that has taken over its relfilenumber.
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c1a5febcbfd..ed46ac3f44e 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -154,7 +154,7 @@ smgropen(RelFileLocator rlocator, BackendId backend)
/* First time through: initialize the hash table */
HASHCTL ctl;
- ctl.keysize = sizeof(RelFileLocatorBackend);
+ ctl.keysize = SizeOfRelFileLocatorBackend;
ctl.entrysize = sizeof(SMgrRelationData);
SMgrRelationHash = hash_create("smgr relation table", 400,
&ctl, HASH_ELEM | HASH_BLOBS);
diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c
index 34efa121b40..9f70f3526c9 100644
--- a/src/backend/utils/adt/dbsize.c
+++ b/src/backend/utils/adt/dbsize.c
@@ -878,7 +878,7 @@ pg_relation_filenode(PG_FUNCTION_ARGS)
if (!RelFileNumberIsValid(result))
PG_RETURN_NULL();
- PG_RETURN_OID(result);
+ PG_RETURN_INT64(result);
}
/*
@@ -898,9 +898,12 @@ Datum
pg_filenode_relation(PG_FUNCTION_ARGS)
{
Oid reltablespace = PG_GETARG_OID(0);
- RelFileNumber relfilenumber = PG_GETARG_OID(1);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(1);
Oid heaprel;
+ /* check whether the relfilenumber is within a valid range */
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
+
/* test needed so RelidByRelfilenumber doesn't misbehave */
if (!RelFileNumberIsValid(relfilenumber))
PG_RETURN_NULL();
diff --git a/src/backend/utils/adt/pg_upgrade_support.c b/src/backend/utils/adt/pg_upgrade_support.c
index 797f5f539af..fc2faed9a7d 100644
--- a/src/backend/utils/adt/pg_upgrade_support.c
+++ b/src/backend/utils/adt/pg_upgrade_support.c
@@ -17,6 +17,7 @@
#include "catalog/pg_type.h"
#include "commands/extension.h"
#include "miscadmin.h"
+#include "storage/relfilelocator.h"
#include "utils/array.h"
#include "utils/builtins.h"
@@ -98,10 +99,12 @@ binary_upgrade_set_next_heap_pg_class_oid(PG_FUNCTION_ARGS)
Datum
binary_upgrade_set_next_heap_relfilenode(PG_FUNCTION_ARGS)
{
- RelFileNumber relfilenumber = PG_GETARG_OID(0);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(0);
CHECK_IS_BINARY_UPGRADE;
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
binary_upgrade_next_heap_pg_class_relfilenumber = relfilenumber;
+ SetNextRelFileNumber(relfilenumber + 1);
PG_RETURN_VOID();
}
@@ -120,10 +123,12 @@ binary_upgrade_set_next_index_pg_class_oid(PG_FUNCTION_ARGS)
Datum
binary_upgrade_set_next_index_relfilenode(PG_FUNCTION_ARGS)
{
- RelFileNumber relfilenumber = PG_GETARG_OID(0);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(0);
CHECK_IS_BINARY_UPGRADE;
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
binary_upgrade_next_index_pg_class_relfilenumber = relfilenumber;
+ SetNextRelFileNumber(relfilenumber + 1);
PG_RETURN_VOID();
}
@@ -142,10 +147,12 @@ binary_upgrade_set_next_toast_pg_class_oid(PG_FUNCTION_ARGS)
Datum
binary_upgrade_set_next_toast_relfilenode(PG_FUNCTION_ARGS)
{
- RelFileNumber relfilenumber = PG_GETARG_OID(0);
+ RelFileNumber relfilenumber = PG_GETARG_INT64(0);
CHECK_IS_BINARY_UPGRADE;
+ CHECK_RELFILENUMBER_RANGE(relfilenumber);
binary_upgrade_next_toast_pg_class_relfilenumber = relfilenumber;
+ SetNextRelFileNumber(relfilenumber + 1);
PG_RETURN_VOID();
}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 00dc0f24037..6f4e96dd33b 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -3712,7 +3712,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
{
/* Allocate a new relfilenumber */
newrelfilenumber = GetNewRelFileNumber(relation->rd_rel->reltablespace,
- NULL, persistence);
+ persistence);
}
else if (relation->rd_rel->relkind == RELKIND_INDEX)
{
diff --git a/src/backend/utils/cache/relfilenumbermap.c b/src/backend/utils/cache/relfilenumbermap.c
index c4245d5ccdd..2e0acf98f20 100644
--- a/src/backend/utils/cache/relfilenumbermap.c
+++ b/src/backend/utils/cache/relfilenumbermap.c
@@ -196,7 +196,7 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber)
/* set scan arguments */
skey[0].sk_argument = ObjectIdGetDatum(reltablespace);
- skey[1].sk_argument = ObjectIdGetDatum(relfilenumber);
+ skey[1].sk_argument = Int64GetDatum((int64) relfilenumber);
scandesc = systable_beginscan(relation,
ClassTblspcRelfilenodeIndexId,
@@ -213,7 +213,7 @@ RelidByRelfilenumber(Oid reltablespace, RelFileNumber relfilenumber)
if (found)
elog(ERROR,
- "unexpected duplicate for tablespace %u, relfilenumber %u",
+ "unexpected duplicate for tablespace %u, relfilenumber " UINT64_FORMAT,
reltablespace, relfilenumber);
found = true;
diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c
index 781f8b87580..d441cd97e2f 100644
--- a/src/backend/utils/misc/pg_controldata.c
+++ b/src/backend/utils/misc/pg_controldata.c
@@ -79,8 +79,8 @@ pg_control_system(PG_FUNCTION_ARGS)
Datum
pg_control_checkpoint(PG_FUNCTION_ARGS)
{
- Datum values[18];
- bool nulls[18];
+ Datum values[19];
+ bool nulls[19];
TupleDesc tupdesc;
HeapTuple htup;
ControlFileData *ControlFile;
@@ -129,6 +129,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS)
XIDOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 18, "checkpoint_time",
TIMESTAMPTZOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 19, "next_relfilenumber",
+ INT8OID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
/* Read the control file. */
@@ -202,6 +204,9 @@ pg_control_checkpoint(PG_FUNCTION_ARGS)
values[17] = TimestampTzGetDatum(time_t_to_timestamptz(ControlFile->checkPointCopy.time));
nulls[17] = false;
+ values[18] = Int64GetDatum((int64) ControlFile->checkPointCopy.nextRelFileNumber);
+ nulls[18] = false;
+
htup = heap_form_tuple(tupdesc, values, nulls);
PG_RETURN_DATUM(HeapTupleGetDatum(htup));
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 324ccf77834..ddb5ec117f2 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -485,9 +485,7 @@ main(int argc, char *argv[])
mode = PG_MODE_ENABLE;
break;
case 'f':
- if (!option_parse_int(optarg, "-f/--filenode", 0,
- INT_MAX,
- NULL))
+ if (!option_parse_relfilenumber(optarg, "-f/--filenode"))
exit(1);
only_filenode = pstrdup(optarg);
break;
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index c390ec51ce9..2f0e91fc2f9 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -250,6 +250,8 @@ main(int argc, char *argv[])
printf(_("Latest checkpoint's NextXID: %u:%u\n"),
EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid),
XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid));
+ printf(_("Latest checkpoint's NextRelFileNumber:%llu\n"),
+ (unsigned long long) ControlFile->checkPointCopy.nextRelFileNumber);
printf(_("Latest checkpoint's NextOID: %u\n"),
ControlFile->checkPointCopy.nextOid);
printf(_("Latest checkpoint's NextMultiXactId: %u\n"),
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index bd9b066e4eb..9f78971cab5 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -3184,15 +3184,15 @@ dumpDatabase(Archive *fout)
atooid(PQgetvalue(lo_res, i, ii_oid)));
oid = atooid(PQgetvalue(lo_res, i, ii_oid));
- relfilenumber = atooid(PQgetvalue(lo_res, i, ii_relfilenode));
+ relfilenumber = atorelnumber(PQgetvalue(lo_res, i, ii_relfilenode));
if (oid == LargeObjectRelationId)
appendPQExpBuffer(loOutQry,
- "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
else if (oid == LargeObjectLOidPNIndexId)
appendPQExpBuffer(loOutQry,
- "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
}
@@ -4877,16 +4877,16 @@ binary_upgrade_set_pg_class_oids(Archive *fout,
relkind = *PQgetvalue(upgrade_res, 0, PQfnumber(upgrade_res, "relkind"));
- relfilenumber = atooid(PQgetvalue(upgrade_res, 0,
- PQfnumber(upgrade_res, "relfilenode")));
+ relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0,
+ PQfnumber(upgrade_res, "relfilenode")));
toast_oid = atooid(PQgetvalue(upgrade_res, 0,
PQfnumber(upgrade_res, "reltoastrelid")));
- toast_relfilenumber = atooid(PQgetvalue(upgrade_res, 0,
- PQfnumber(upgrade_res, "toast_relfilenode")));
+ toast_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0,
+ PQfnumber(upgrade_res, "toast_relfilenode")));
toast_index_oid = atooid(PQgetvalue(upgrade_res, 0,
PQfnumber(upgrade_res, "indexrelid")));
- toast_index_relfilenumber = atooid(PQgetvalue(upgrade_res, 0,
- PQfnumber(upgrade_res, "toast_index_relfilenode")));
+ toast_index_relfilenumber = atorelnumber(PQgetvalue(upgrade_res, 0,
+ PQfnumber(upgrade_res, "toast_index_relfilenode")));
appendPQExpBufferStr(upgrade_buffer,
"\n-- For binary upgrade, must preserve pg_class oids and relfilenodes\n");
@@ -4904,7 +4904,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout,
*/
if (RelFileNumberIsValid(relfilenumber) && relkind != RELKIND_PARTITIONED_TABLE)
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_heap_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
/*
@@ -4918,7 +4918,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout,
"SELECT pg_catalog.binary_upgrade_set_next_toast_pg_class_oid('%u'::pg_catalog.oid);\n",
toast_oid);
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_toast_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
toast_relfilenumber);
/* every toast table has an index */
@@ -4926,7 +4926,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout,
"SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n",
toast_index_oid);
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
toast_index_relfilenumber);
}
@@ -4939,7 +4939,7 @@ binary_upgrade_set_pg_class_oids(Archive *fout,
"SELECT pg_catalog.binary_upgrade_set_next_index_pg_class_oid('%u'::pg_catalog.oid);\n",
pg_class_oid);
appendPQExpBuffer(upgrade_buffer,
- "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('%u'::pg_catalog.oid);\n",
+ "SELECT pg_catalog.binary_upgrade_set_next_index_relfilenode('" UINT64_FORMAT "'::pg_catalog.int8);\n",
relfilenumber);
}
diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 269ed6446e6..197ec0eac91 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -538,7 +538,7 @@ isRelDataFile(const char *path)
segNo = 0;
matched = false;
- nmatch = sscanf(path, "global/%u.%u", &rlocator.relNumber, &segNo);
+ nmatch = sscanf(path, "global/" UINT64_FORMAT ".%u", &rlocator.relNumber, &segNo);
if (nmatch == 1 || nmatch == 2)
{
rlocator.spcOid = GLOBALTABLESPACE_OID;
@@ -547,7 +547,7 @@ isRelDataFile(const char *path)
}
else
{
- nmatch = sscanf(path, "base/%u/%u.%u",
+ nmatch = sscanf(path, "base/%u/" UINT64_FORMAT ".%u",
&rlocator.dbOid, &rlocator.relNumber, &segNo);
if (nmatch == 2 || nmatch == 3)
{
@@ -556,7 +556,7 @@ isRelDataFile(const char *path)
}
else
{
- nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/%u.%u",
+ nmatch = sscanf(path, "pg_tblspc/%u/" TABLESPACE_VERSION_DIRECTORY "/%u/" UINT64_FORMAT ".%u",
&rlocator.spcOid, &rlocator.dbOid, &rlocator.relNumber,
&segNo);
if (nmatch == 3 || nmatch == 4)
diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c
index f18cf971202..0c712a62669 100644
--- a/src/bin/pg_upgrade/info.c
+++ b/src/bin/pg_upgrade/info.c
@@ -527,7 +527,8 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo)
relname = PQgetvalue(res, relnum, i_relname);
curr->relname = pg_strdup(relname);
- curr->relfilenumber = atooid(PQgetvalue(res, relnum, i_relfilenumber));
+ curr->relfilenumber =
+ atorelnumber(PQgetvalue(res, relnum, i_relfilenumber));
curr->tblsp_alloc = false;
/* Is the tablespace oid non-default? */
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 115faa222e3..7ab1bcc9c8d 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -15,10 +15,8 @@
* oids are the same between old and new clusters. This is important
* because toast oids are stored as toast pointers in user tables.
*
- * While pg_class.oid and pg_class.relfilenode are initially the same in a
- * cluster, they can diverge due to CLUSTER, REINDEX, or VACUUM FULL. We
- * control assignments of pg_class.relfilenode because we want the filenames
- * to match between the old and new cluster.
+ * We control assignments of pg_class.relfilenode because we want the
+ * filenames to match between the old and new cluster.
*
* We control assignment of pg_tablespace.oid because we want the oid to match
* between the old and new cluster.
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c
index c3f3d6bc0af..529267d670a 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -190,14 +190,14 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
else
snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno);
- snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s",
+ snprintf(old_file, sizeof(old_file), "%s%s/%u/" UINT64_FORMAT "%s%s",
map->old_tablespace,
map->old_tablespace_suffix,
map->db_oid,
map->relfilenumber,
type_suffix,
extent_suffix);
- snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s",
+ snprintf(new_file, sizeof(new_file), "%s%s/%u/" UINT64_FORMAT "%s%s",
map->new_tablespace,
map->new_tablespace_suffix,
map->db_oid,
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index 9993378ca58..6fdc7dcf529 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -884,7 +884,7 @@ main(int argc, char **argv)
}
break;
case 'R':
- if (sscanf(optarg, "%u/%u/%u",
+ if (sscanf(optarg, "%u/%u/" UINT64_FORMAT,
&config.filter_by_relation.spcOid,
&config.filter_by_relation.dbOid,
&config.filter_by_relation.relNumber) != 3 ||
diff --git a/src/bin/scripts/t/090_reindexdb.pl b/src/bin/scripts/t/090_reindexdb.pl
index e706d686e39..de5cee6fa08 100644
--- a/src/bin/scripts/t/090_reindexdb.pl
+++ b/src/bin/scripts/t/090_reindexdb.pl
@@ -40,7 +40,7 @@ my $toast_index = $node->safe_psql('postgres',
# REINDEX operations. A set of relfilenodes is saved from the catalogs
# and then compared with pg_class.
$node->safe_psql('postgres',
- 'CREATE TABLE index_relfilenodes (parent regclass, indname text, indoid oid, relfilenode oid);'
+ 'CREATE TABLE index_relfilenodes (parent regclass, indname text, indoid oid, relfilenode int8);'
);
# Save the relfilenode of a set of toast indexes, one from the catalog
# pg_constraint and one from the test table.
diff --git a/src/common/relpath.c b/src/common/relpath.c
index 1b6b620ce83..d0d83e593b5 100644
--- a/src/common/relpath.c
+++ b/src/common/relpath.c
@@ -149,10 +149,10 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
Assert(dbOid == 0);
Assert(backendId == InvalidBackendId);
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("global/%u_%s",
+ path = psprintf("global/" UINT64_FORMAT "_%s",
relNumber, forkNames[forkNumber]);
else
- path = psprintf("global/%u", relNumber);
+ path = psprintf("global/" UINT64_FORMAT, relNumber);
}
else if (spcOid == DEFAULTTABLESPACE_OID)
{
@@ -160,21 +160,21 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
if (backendId == InvalidBackendId)
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("base/%u/%u_%s",
+ path = psprintf("base/%u/" UINT64_FORMAT "_%s",
dbOid, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("base/%u/%u",
+ path = psprintf("base/%u/" UINT64_FORMAT,
dbOid, relNumber);
}
else
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("base/%u/t%d_%u_%s",
+ path = psprintf("base/%u/t%d_" UINT64_FORMAT "_%s",
dbOid, backendId, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("base/%u/t%d_%u",
+ path = psprintf("base/%u/t%d_" UINT64_FORMAT,
dbOid, backendId, relNumber);
}
}
@@ -184,24 +184,24 @@ GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber,
if (backendId == InvalidBackendId)
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("pg_tblspc/%u/%s/%u/%u_%s",
+ path = psprintf("pg_tblspc/%u/%s/%u/" UINT64_FORMAT "_%s",
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("pg_tblspc/%u/%s/%u/%u",
+ path = psprintf("pg_tblspc/%u/%s/%u/" UINT64_FORMAT,
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, relNumber);
}
else
{
if (forkNumber != MAIN_FORKNUM)
- path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u_%s",
+ path = psprintf("pg_tblspc/%u/%s/%u/t%d_" UINT64_FORMAT "_%s",
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, backendId, relNumber,
forkNames[forkNumber]);
else
- path = psprintf("pg_tblspc/%u/%s/%u/t%d_%u",
+ path = psprintf("pg_tblspc/%u/%s/%u/t%d_" UINT64_FORMAT,
spcOid, TABLESPACE_VERSION_DIRECTORY,
dbOid, backendId, relNumber);
}
diff --git a/src/fe_utils/option_utils.c b/src/fe_utils/option_utils.c
index abea88154ca..d4978527b41 100644
--- a/src/fe_utils/option_utils.c
+++ b/src/fe_utils/option_utils.c
@@ -13,6 +13,7 @@
#include "postgres_fe.h"
#include "common/logging.h"
+#include "common/relpath.h"
#include "common/string.h"
#include "fe_utils/option_utils.h"
@@ -82,3 +83,42 @@ option_parse_int(const char *optarg, const char *optname,
*result = val;
return true;
}
+
+/*
+ * option_parse_relfilenumber
+ *
+ * Parse relfilenumber value for an option. If the parsing is successful,
+ * returns true; if parsing fails, returns false.
+ */
+bool
+option_parse_relfilenumber(const char *optarg, const char *optname)
+{
+	char	   *endptr;
+	uint64		val;
+
+	errno = 0;
+	val = strtou64(optarg, &endptr, 10);
+
+	/*
+	 * Fail if no digits were consumed (empty or non-numeric input), or if
+	 * anything but whitespace follows the number.
+	 */
+	if (endptr != optarg)
+		while (*endptr != '\0' && isspace((unsigned char) *endptr))
+			endptr++;
+	if (endptr == optarg || *endptr != '\0')
+	{
+		pg_log_error("invalid value \"%s\" for option %s",
+					 optarg, optname);
+		return false;
+	}
+
+	if (errno == ERANGE || val > MAX_RELFILENUMBER)
+	{
+		pg_log_error("%s must be in range " UINT64_FORMAT ".." UINT64_FORMAT,
+					 optname, UINT64CONST(0), MAX_RELFILENUMBER);
+		return false;
+	}
+
+	return true;
+}
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 775471d2a7d..2aaad2b9d51 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -15,6 +15,7 @@
#define TRANSAM_H
#include "access/xlogdefs.h"
+#include "common/relpath.h"
/* ----------------
@@ -196,6 +197,33 @@ FullTransactionIdAdvance(FullTransactionId *dest)
#define FirstUnpinnedObjectId 12000
#define FirstNormalObjectId 16384
+/* ----------
+ * RelFileNumbers are normally assigned sequentially beginning with
+ * FirstNormalRelFileNumber, but for system tables the initial RelFileNumber
+ * is equal to the table OID. This scheme allows pg_upgrade to work: we expect
+ * that the new cluster will contain only system tables, and that none of those
+ * will have previously been rewritten, so any RelFileNumber which is in use
+ * in both the old and new clusters will be used for the same relation in both
+ * places.
+ *
+ * This is important because pg_upgrade can't reactively move conflicting
+ * relations out of the way. If it tries to set the RelFileNumber for a
+ * relation to some value that's already in use by a different relation, the
+ * upgrade will just fail. It's OK if the same RelFileNumber is used for the
+ * same relation, though, since then nothing needs to be changed.
+ * ----------
+ */
+#define FirstNormalRelFileNumber ((RelFileNumber) 100000)
+
+#define CHECK_RELFILENUMBER_RANGE(relfilenumber) \
+do { \
+ if ((relfilenumber) < 0 || (relfilenumber) > MAX_RELFILENUMBER) \
+ ereport(ERROR, \
+ errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
+ errmsg("relfilenumber %llu is out of range", \
+ (unsigned long long) (relfilenumber))); \
+} while (0)
+
/*
* VariableCache is a data structure in shared memory that is used to track
* OID and XID assignment state. For largely historical reasons, there is
@@ -214,6 +242,15 @@ typedef struct VariableCacheData
Oid nextOid; /* next OID to assign */
uint32 oidCount; /* OIDs available before must do XLOG work */
+ /*
+ * These fields are protected by RelFileNumberGenLock.
+ */
+ RelFileNumber nextRelFileNumber; /* next relfilenumber to assign */
+ RelFileNumber loggedRelFileNumber; /* last logged relfilenumber */
+ RelFileNumber flushedRelFileNumber; /* last flushed relfilenumber */
+ XLogRecPtr loggedRelFileNumberRecPtr; /* xlog record pointer w.r.t.
+ * loggedRelFileNumber */
+
/*
* These fields are protected by XidGenLock.
*/
@@ -293,6 +330,9 @@ extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid);
extern bool ForceTransactionIdLimitUpdate(void);
extern Oid GetNewObjectId(void);
+extern RelFileNumber GetNewRelFileNumber(Oid reltablespace,
+ char relpersistence);
+extern void SetNextRelFileNumber(RelFileNumber relnumber);
extern void StopGeneratingPinnedObjectIds(void);
#ifdef USE_ASSERT_CHECKING
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index dce265098e3..53375865dfd 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -236,6 +236,7 @@ extern void CreateCheckPoint(int flags);
extern bool CreateRestartPoint(int flags);
extern WALAvailability GetWALAvailability(XLogRecPtr targetLSN);
extern void XLogPutNextOid(Oid nextOid);
+extern XLogRecPtr LogNextRelFileNumber(RelFileNumber nextrelnumber);
extern XLogRecPtr XLogRestorePoint(const char *rpName);
extern void UpdateFullPageWrites(void);
extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p);
diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h
index e1c85f98550..b45253045e7 100644
--- a/src/include/catalog/catalog.h
+++ b/src/include/catalog/catalog.h
@@ -38,8 +38,5 @@ extern bool IsPinnedObject(Oid classId, Oid objectId);
extern Oid GetNewOidWithIndex(Relation relation, Oid indexId,
AttrNumber oidcolumn);
-extern RelFileNumber GetNewRelFileNumber(Oid reltablespace,
- Relation pg_class,
- char relpersistence);
#endif /* CATALOG_H */
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 95e7c249ed8..8ba25e4dc8e 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202209261
+#define CATALOG_VERSION_NO 202209271
#endif
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index e1f4eefa220..4768e5ebda5 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -34,6 +34,13 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat
/* oid */
Oid oid;
+ /* access method; 0 if not a table / index */
+ Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am);
+
+ /* identifier of physical storage file */
+ /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
+ int64 relfilenode BKI_DEFAULT(0);
+
/* class name */
NameData relname;
@@ -49,13 +56,6 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat
/* class owner */
Oid relowner BKI_DEFAULT(POSTGRES) BKI_LOOKUP(pg_authid);
- /* access method; 0 if not a table / index */
- Oid relam BKI_DEFAULT(heap) BKI_LOOKUP_OPT(pg_am);
-
- /* identifier of physical storage file */
- /* relfilenode == 0 means it is a "mapped" relation, see relmapper.c */
- Oid relfilenode BKI_DEFAULT(0);
-
/* identifier of table space for relation (0 means default for database) */
Oid reltablespace BKI_DEFAULT(0) BKI_LOOKUP_OPT(pg_tablespace);
@@ -154,7 +154,7 @@ typedef FormData_pg_class *Form_pg_class;
DECLARE_UNIQUE_INDEX_PKEY(pg_class_oid_index, 2662, ClassOidIndexId, on pg_class using btree(oid oid_ops));
DECLARE_UNIQUE_INDEX(pg_class_relname_nsp_index, 2663, ClassNameNspIndexId, on pg_class using btree(relname name_ops, relnamespace oid_ops));
-DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode oid_ops));
+DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeIndexId, on pg_class using btree(reltablespace oid_ops, relfilenode int8_ops));
#ifdef EXPOSE_TO_CLIENT_CODE
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 06368e23667..096222f1fe5 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -41,6 +41,7 @@ typedef struct CheckPoint
* timeline (equals ThisTimeLineID otherwise) */
bool fullPageWrites; /* current full_page_writes */
FullTransactionId nextXid; /* next free transaction ID */
+ RelFileNumber nextRelFileNumber; /* next relfilenumber */
Oid nextOid; /* next free OID */
MultiXactId nextMulti; /* next free MultiXactId */
MultiXactOffset nextMultiOffset; /* next free MultiXact offset */
@@ -78,6 +79,7 @@ typedef struct CheckPoint
#define XLOG_FPI 0xB0
/* 0xC0 is used in Postgres 9.5-11 */
#define XLOG_OVERWRITE_CONTRECORD 0xD0
+#define XLOG_NEXT_RELFILENUMBER 0xE0
/*
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index a07e737a337..8b72f8a215b 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -7329,11 +7329,11 @@
proname => 'pg_indexes_size', provolatile => 'v', prorettype => 'int8',
proargtypes => 'regclass', prosrc => 'pg_indexes_size' },
{ oid => '2999', descr => 'filenode identifier of relation',
- proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'oid',
+ proname => 'pg_relation_filenode', provolatile => 's', prorettype => 'int8',
proargtypes => 'regclass', prosrc => 'pg_relation_filenode' },
{ oid => '3454', descr => 'relation OID for filenode and tablespace',
proname => 'pg_filenode_relation', provolatile => 's',
- prorettype => 'regclass', proargtypes => 'oid oid',
+ prorettype => 'regclass', proargtypes => 'oid int8',
prosrc => 'pg_filenode_relation' },
{ oid => '3034', descr => 'file path of relation',
proname => 'pg_relation_filepath', provolatile => 's', prorettype => 'text',
@@ -11125,15 +11125,15 @@
prosrc => 'binary_upgrade_set_missing_value' },
{ oid => '4545', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_heap_relfilenode', provolatile => 'v',
- proparallel => 'u', prorettype => 'void', proargtypes => 'oid',
+ proparallel => 'u', prorettype => 'void', proargtypes => 'int8',
prosrc => 'binary_upgrade_set_next_heap_relfilenode' },
{ oid => '4546', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_index_relfilenode', provolatile => 'v',
- proparallel => 'u', prorettype => 'void', proargtypes => 'oid',
+ proparallel => 'u', prorettype => 'void', proargtypes => 'int8',
prosrc => 'binary_upgrade_set_next_index_relfilenode' },
{ oid => '4547', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_toast_relfilenode', provolatile => 'v',
- proparallel => 'u', prorettype => 'void', proargtypes => 'oid',
+ proparallel => 'u', prorettype => 'void', proargtypes => 'int8',
prosrc => 'binary_upgrade_set_next_toast_relfilenode' },
{ oid => '4548', descr => 'for use by pg_upgrade',
proname => 'binary_upgrade_set_next_pg_tablespace_oid', provolatile => 'v',
diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h
index 4bbd94393c8..2d3b52fe0b8 100644
--- a/src/include/common/relpath.h
+++ b/src/include/common/relpath.h
@@ -22,10 +22,12 @@
/*
* RelFileNumber data type identifies the specific relation file name.
*/
-typedef Oid RelFileNumber;
-#define InvalidRelFileNumber ((RelFileNumber) InvalidOid)
+typedef uint64 RelFileNumber;
+#define InvalidRelFileNumber ((RelFileNumber) 0)
#define RelFileNumberIsValid(relnumber) \
((bool) ((relnumber) != InvalidRelFileNumber))
+#define atorelnumber(x) ((RelFileNumber) strtou64((x), NULL, 10))
+#define MAX_RELFILENUMBER UINT64CONST(0x00FFFFFFFFFFFFFF)
/*
* Name of major-version-specific tablespace subdirectories
@@ -35,6 +37,7 @@ typedef Oid RelFileNumber;
/* Characters to allow for an OID in a relation path */
#define OIDCHARS 10 /* max chars printed by %u */
+#define RELNUMBERCHARS 20 /* max chars printed by UINT64_FORMAT */
/*
* Stuff for fork names.
diff --git a/src/include/fe_utils/option_utils.h b/src/include/fe_utils/option_utils.h
index 03c09fd13a4..2508a6193b0 100644
--- a/src/include/fe_utils/option_utils.h
+++ b/src/include/fe_utils/option_utils.h
@@ -22,5 +22,7 @@ extern void handle_help_version_opts(int argc, char *argv[],
extern bool option_parse_int(const char *optarg, const char *optname,
int min_range, int max_range,
int *result);
+extern bool option_parse_relfilenumber(const char *optarg,
+ const char *optname);
#endif /* OPTION_UTILS_H */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 406db6be783..c3417b28ba9 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -92,29 +92,66 @@ typedef struct buftag
{
Oid spcOid; /* tablespace oid */
Oid dbOid; /* database oid */
- RelFileNumber relNumber; /* relation file number */
- ForkNumber forkNum; /* fork number */
+
+ /*
+ * relForkDetails[] stores the fork number in the high 8 bits of the first
+ * integer; the remaining 56 bits are used to store the relfilenumber.
+ * Expanding the relfilenumber to a full 64 bits would require widening
+ * the BufferTag, which is undesirable for performance reasons. We use
+ * two 32-bit values here rather than a single 64-bit value to avoid
+ * padding the struct out to a multiple of 8 bytes.
+ */
+ uint32 relForkDetails[2];
BlockNumber blockNum; /* blknum relative to begin of reln */
} BufferTag;
+/* High relNumber bits in relForkDetails[0] */
+#define BUFTAG_RELNUM_HIGH_BITS 24
+
+/* Low relNumber bits in relForkDetails[1] */
+#define BUFTAG_RELNUM_LOW_BITS 32
+
+/* Mask to fetch high bits of relNumber from relForkDetails[0] */
+#define BUFTAG_RELNUM_HIGH_MASK ((1U << BUFTAG_RELNUM_HIGH_BITS) - 1)
+
+/* Mask to fetch low bits of relNumber from relForkDetails[1] */
+#define BUFTAG_RELNUM_LOW_MASK 0XFFFFFFFF
+
static inline RelFileNumber
BufTagGetRelNumber(const BufferTag *tag)
{
- return tag->relNumber;
+ uint64 relnum;
+
+ relnum = ((uint64) tag->relForkDetails[0]) & BUFTAG_RELNUM_HIGH_MASK;
+ relnum = (relnum << BUFTAG_RELNUM_LOW_BITS) | tag->relForkDetails[1];
+
+ Assert(relnum <= MAX_RELFILENUMBER);
+ return (RelFileNumber) relnum;
}
static inline ForkNumber
BufTagGetForkNum(const BufferTag *tag)
{
- return tag->forkNum;
+ ForkNumber ret;
+
+ StaticAssertStmt(MAX_FORKNUM <= INT8_MAX,
+ "MAX_FORKNUM can't be greater than INT8_MAX");
+
+ ret = (int8) (tag->relForkDetails[0] >> BUFTAG_RELNUM_HIGH_BITS);
+ return ret;
}
static inline void
BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber,
ForkNumber forknum)
{
- tag->relNumber = relnumber;
- tag->forkNum = forknum;
+ Assert(relnumber <= MAX_RELFILENUMBER);
+ Assert(forknum <= MAX_FORKNUM);
+
+ tag->relForkDetails[0] = (relnumber >> BUFTAG_RELNUM_LOW_BITS) &
+ BUFTAG_RELNUM_HIGH_MASK;
+ tag->relForkDetails[0] |= (forknum << BUFTAG_RELNUM_HIGH_BITS);
+ tag->relForkDetails[1] = relnumber & BUFTAG_RELNUM_LOW_MASK;
}
static inline RelFileLocator
@@ -153,9 +190,9 @@ BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
{
return (tag1->spcOid == tag2->spcOid) &&
(tag1->dbOid == tag2->dbOid) &&
- (tag1->relNumber == tag2->relNumber) &&
- (tag1->blockNum == tag2->blockNum) &&
- (tag1->forkNum == tag2->forkNum);
+ (tag1->relForkDetails[0] == tag2->relForkDetails[0]) &&
+ (tag1->relForkDetails[1] == tag2->relForkDetails[1]) &&
+ (tag1->blockNum == tag2->blockNum);
}
static inline bool
diff --git a/src/include/storage/relfilelocator.h b/src/include/storage/relfilelocator.h
index 10f41f3abb3..ef904644fa4 100644
--- a/src/include/storage/relfilelocator.h
+++ b/src/include/storage/relfilelocator.h
@@ -32,10 +32,11 @@
* Nonzero dbOid values correspond to pg_database.oid.
*
* relNumber identifies the specific relation. relNumber corresponds to
- * pg_class.relfilenode (NOT pg_class.oid, because we need to be able
- * to assign new physical files to relations in some situations).
- * Notice that relNumber is only unique within a database in a particular
- * tablespace.
+ * pg_class.relfilenode. Notice that relNumber values are assigned by
+ * GetNewRelFileNumber(), which will only ever assign the same value once
+ * during the lifetime of a cluster. However, since CREATE DATABASE duplicates
+ * the relfilenumbers of the template database, the values are in practice only
+ * unique within a database, not globally.
*
* Note: spcOid must be GLOBALTABLESPACE_OID if and only if dbOid is
* zero. We support shared relations only in the "global" tablespace.
@@ -75,6 +76,9 @@ typedef struct RelFileLocatorBackend
BackendId backend;
} RelFileLocatorBackend;
+#define SizeOfRelFileLocatorBackend \
+ (offsetof(RelFileLocatorBackend, backend) + sizeof(BackendId))
+
#define RelFileLocatorBackendIsTemp(rlocator) \
((rlocator).backend != InvalidBackendId)
diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
index 346f594ad02..86666b83ae5 100644
--- a/src/test/regress/expected/alter_table.out
+++ b/src/test/regress/expected/alter_table.out
@@ -2164,9 +2164,8 @@ select relname,
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
@@ -2175,10 +2174,10 @@ select relname,
relname | orig_oid | storage | desc
------------------------------+----------+---------+---------------
at_partitioned | t | none |
- at_partitioned_0 | t | own |
- at_partitioned_0_id_name_key | t | own | child 0 index
- at_partitioned_1 | t | own |
- at_partitioned_1_id_name_key | t | own | child 1 index
+ at_partitioned_0 | t | orig |
+ at_partitioned_0_id_name_key | t | orig | child 0 index
+ at_partitioned_1 | t | orig |
+ at_partitioned_1_id_name_key | t | orig | child 1 index
at_partitioned_id_name_key | t | none | parent index
(6 rows)
@@ -2198,9 +2197,8 @@ select relname,
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
@@ -2209,10 +2207,10 @@ select relname,
relname | orig_oid | storage | desc
------------------------------+----------+---------+--------------
at_partitioned | t | none |
- at_partitioned_0 | t | own |
- at_partitioned_0_id_name_key | f | own | parent index
- at_partitioned_1 | t | own |
- at_partitioned_1_id_name_key | f | own | parent index
+ at_partitioned_0 | t | orig |
+ at_partitioned_0_id_name_key | f | new | parent index
+ at_partitioned_1 | t | orig |
+ at_partitioned_1_id_name_key | f | new | parent index
at_partitioned_id_name_key | f | none | parent index
(6 rows)
@@ -2560,7 +2558,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text)
RETURNS boolean
LANGUAGE plpgsql AS $$
DECLARE
- v_relfilenode oid;
+ v_relfilenode int8;
BEGIN
v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename;
diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out
index 91f25717b5a..0a35f333f63 100644
--- a/src/test/regress/expected/fast_default.out
+++ b/src/test/regress/expected/fast_default.out
@@ -3,8 +3,8 @@
--
SET search_path = fast_default;
CREATE SCHEMA fast_default;
-CREATE TABLE m(id OID);
-INSERT INTO m VALUES (NULL::OID);
+CREATE TABLE m(id BIGINT);
+INSERT INTO m VALUES (NULL::BIGINT);
CREATE FUNCTION set(tabname name) RETURNS VOID
AS $$
BEGIN
diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out
index 215eb899be3..af57470f93e 100644
--- a/src/test/regress/expected/oidjoins.out
+++ b/src/test/regress/expected/oidjoins.out
@@ -74,11 +74,11 @@ NOTICE: checking pg_type {typcollation} => pg_collation {oid}
NOTICE: checking pg_attribute {attrelid} => pg_class {oid}
NOTICE: checking pg_attribute {atttypid} => pg_type {oid}
NOTICE: checking pg_attribute {attcollation} => pg_collation {oid}
+NOTICE: checking pg_class {relam} => pg_am {oid}
NOTICE: checking pg_class {relnamespace} => pg_namespace {oid}
NOTICE: checking pg_class {reltype} => pg_type {oid}
NOTICE: checking pg_class {reloftype} => pg_type {oid}
NOTICE: checking pg_class {relowner} => pg_authid {oid}
-NOTICE: checking pg_class {relam} => pg_am {oid}
NOTICE: checking pg_class {reltablespace} => pg_tablespace {oid}
NOTICE: checking pg_class {reltoastrelid} => pg_class {oid}
NOTICE: checking pg_class {relrewrite} => pg_class {oid}
diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
index 9f773aeeb94..a67eb5f982a 100644
--- a/src/test/regress/sql/alter_table.sql
+++ b/src/test/regress/sql/alter_table.sql
@@ -1478,9 +1478,8 @@ select relname,
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
@@ -1499,9 +1498,8 @@ select relname,
c.oid = oldoid as orig_oid,
case relfilenode
when 0 then 'none'
- when c.oid then 'own'
when oldfilenode then 'orig'
- else 'OTHER'
+ else 'new'
end as storage,
obj_description(c.oid, 'pg_class') as desc
from pg_class c left join old_oids using (relname)
@@ -1641,7 +1639,7 @@ CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text)
RETURNS boolean
LANGUAGE plpgsql AS $$
DECLARE
- v_relfilenode oid;
+ v_relfilenode int8;
BEGIN
v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename;
diff --git a/src/test/regress/sql/fast_default.sql b/src/test/regress/sql/fast_default.sql
index 16a3b7ca51d..819ec40fdaf 100644
--- a/src/test/regress/sql/fast_default.sql
+++ b/src/test/regress/sql/fast_default.sql
@@ -4,8 +4,8 @@
SET search_path = fast_default;
CREATE SCHEMA fast_default;
-CREATE TABLE m(id OID);
-INSERT INTO m VALUES (NULL::OID);
+CREATE TABLE m(id BIGINT);
+INSERT INTO m VALUES (NULL::BIGINT);
CREATE FUNCTION set(tabname name) RETURNS VOID
AS $$