mirror of https://github.com/postgres/postgres
free space information is stored in a dedicated FSM relation fork, with each relation (except for hash indexes; they don't use FSM). This eliminates the max_fsm_relations and max_fsm_pages GUC options; remove any trace of them from the backend, initdb, and documentation. Rewrite contrib/pg_freespacemap to match the new FSM implementation. Also introduce a new variant of the get_raw_page(regclass, int4, int4) function in contrib/pageinspect that lets you return pages from any relation fork, and a new fsm_page_contents() function to inspect the new FSM pages.REL8_5_ALPHA1_BRANCH
parent
2dbc0ca937
commit
15c121b3ed
@ -0,0 +1,61 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* fsmfuncs.c |
||||
* Functions to investigate FSM pages |
||||
* |
||||
* These functions are restricted to superusers for the fear of introducing |
||||
* security holes if the input checking isn't as water-tight as it should. |
||||
* You'd need to be superuser to obtain a raw page image anyway, so |
||||
* there's hardly any use case for using these without superuser-rights |
||||
* anyway. |
||||
* |
||||
* Copyright (c) 2007-2008, PostgreSQL Global Development Group |
||||
* |
||||
* IDENTIFICATION |
||||
* $PostgreSQL: pgsql/contrib/pageinspect/fsmfuncs.c,v 1.1 2008/09/30 10:52:09 heikki Exp $ |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
|
||||
#include "postgres.h" |
||||
#include "lib/stringinfo.h" |
||||
#include "storage/fsm_internals.h" |
||||
#include "utils/builtins.h" |
||||
#include "miscadmin.h" |
||||
#include "funcapi.h" |
||||
|
||||
Datum fsm_page_contents(PG_FUNCTION_ARGS); |
||||
|
||||
/*
|
||||
* Dumps the contents of a FSM page. |
||||
*/ |
||||
PG_FUNCTION_INFO_V1(fsm_page_contents); |
||||
|
||||
Datum |
||||
fsm_page_contents(PG_FUNCTION_ARGS) |
||||
{ |
||||
bytea *raw_page = PG_GETARG_BYTEA_P(0); |
||||
int raw_page_size; |
||||
StringInfoData sinfo; |
||||
FSMPage fsmpage; |
||||
int i; |
||||
|
||||
if (!superuser()) |
||||
ereport(ERROR, |
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), |
||||
(errmsg("must be superuser to use raw page functions")))); |
||||
|
||||
raw_page_size = VARSIZE(raw_page) - VARHDRSZ; |
||||
fsmpage = (FSMPage) PageGetContents(VARDATA(raw_page)); |
||||
|
||||
initStringInfo(&sinfo); |
||||
|
||||
for(i=0; i < NodesPerPage; i++) |
||||
{ |
||||
if (fsmpage->fp_nodes[i] != 0) |
||||
appendStringInfo(&sinfo, "%d: %d\n", i, fsmpage->fp_nodes[i]); |
||||
} |
||||
appendStringInfo(&sinfo, "fp_next_slot: %d\n", fsmpage->fp_next_slot); |
||||
|
||||
PG_RETURN_TEXT_P(cstring_to_text(sinfo.data)); |
||||
} |
@ -1,44 +1,26 @@ |
||||
/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.8 2007/11/13 04:24:28 momjian Exp $ */ |
||||
/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.9 2008/09/30 10:52:09 heikki Exp $ */ |
||||
|
||||
-- Adjust this setting to control where the objects get created. |
||||
SET search_path = public; |
||||
|
||||
|
||||
-- Register the functions. |
||||
CREATE OR REPLACE FUNCTION pg_freespacemap_pages() |
||||
RETURNS SETOF RECORD |
||||
AS 'MODULE_PATHNAME', 'pg_freespacemap_pages' |
||||
-- Register the C function. |
||||
CREATE OR REPLACE FUNCTION pg_freespace(regclass, int4) |
||||
RETURNS int2 |
||||
AS 'MODULE_PATHNAME', 'pg_freespace' |
||||
LANGUAGE C; |
||||
|
||||
CREATE OR REPLACE FUNCTION pg_freespacemap_relations() |
||||
-- pg_freespace shows the recorded space avail at each block in a relation |
||||
CREATE OR REPLACE FUNCTION |
||||
pg_freespace(rel regclass, blkno OUT int4, avail OUT int2) |
||||
RETURNS SETOF RECORD |
||||
AS 'MODULE_PATHNAME', 'pg_freespacemap_relations' |
||||
LANGUAGE C; |
||||
AS $$ |
||||
SELECT blkno::int4, pg_freespace($1, blkno::int4) AS avail |
||||
FROM generate_series(0, pg_relation_size($1) / current_setting('block_size')::bigint - 1) AS blkno; |
||||
$$ |
||||
LANGUAGE SQL; |
||||
|
||||
|
||||
-- Create views for convenient access. |
||||
CREATE VIEW pg_freespacemap_pages AS |
||||
SELECT P.* FROM pg_freespacemap_pages() AS P |
||||
(reltablespace oid, |
||||
reldatabase oid, |
||||
relfilenode oid, |
||||
relblocknumber bigint, |
||||
bytes integer); |
||||
|
||||
CREATE VIEW pg_freespacemap_relations AS |
||||
SELECT P.* FROM pg_freespacemap_relations() AS P |
||||
(reltablespace oid, |
||||
reldatabase oid, |
||||
relfilenode oid, |
||||
avgrequest integer, |
||||
interestingpages integer, |
||||
storedpages integer, |
||||
nextpage integer); |
||||
|
||||
|
||||
-- Don't want these to be available to public. |
||||
REVOKE ALL ON FUNCTION pg_freespacemap_pages() FROM PUBLIC; |
||||
REVOKE ALL ON pg_freespacemap_pages FROM PUBLIC; |
||||
|
||||
REVOKE ALL ON FUNCTION pg_freespacemap_relations() FROM PUBLIC; |
||||
REVOKE ALL ON pg_freespacemap_relations FROM PUBLIC; |
||||
REVOKE ALL ON FUNCTION pg_freespace(regclass, int4) FROM PUBLIC; |
||||
REVOKE ALL ON FUNCTION pg_freespace(regclass) FROM PUBLIC; |
||||
|
@ -0,0 +1,195 @@ |
||||
$PostgreSQL: pgsql/src/backend/storage/freespace/README,v 1.1 2008/09/30 10:52:13 heikki Exp $ |
||||
|
||||
Free Space Map |
||||
-------------- |
||||
|
||||
The purpose of the free space map is to quickly locate a page with enough |
||||
free space to hold a tuple to be stored; or to determine that no such page |
||||
exists and the relation must be extended by one page. As of PostgreSQL 8.4 |
||||
each relation has its own, extensible free space map stored in a separate |
||||
"fork" of its relation. This eliminates the disadvantages of the former |
||||
fixed-size FSM. |
||||
|
||||
It is important to keep the map small so that it can be searched rapidly. |
||||
Therefore, we don't attempt to record the exact free space on a page. |
||||
We allocate one map byte to each page, allowing us to record free space |
||||
at a granularity of 1/256th of a page. Another way to say it is that |
||||
the stored value is the free space divided by BLCKSZ/256 (rounding down). |
||||
We assume that the free space must always be less than BLCKSZ, since |
||||
all pages have some overhead; so the maximum map value is 255. |
||||
|
||||
To assist in fast searching, the map isn't simply an array of per-page |
||||
entries, but has a tree structure above those entries. There is a tree |
||||
structure of pages, and a tree structure within each page, as described |
||||
below. |
||||
|
||||
FSM page structure |
||||
------------------ |
||||
|
||||
Within each FSM page, we use a binary tree structure where leaf nodes store |
||||
the amount of free space on heap pages (or lower level FSM pages, see |
||||
"Higher-level structure" below), with one leaf node per heap page. A non-leaf |
||||
node stores the max amount of free space on any of its children. |
||||
|
||||
For example: |
||||
|
||||
4 |
||||
4 2 |
||||
3 4 0 2 <- This level represents heap pages |
||||
|
||||
We need two basic operations: search and update. |
||||
|
||||
To search for a page with X amount of free space, traverse down the tree |
||||
along a path where n >= X, until you hit the bottom. If both children of a |
||||
node satisfy the condition, you can pick either one arbitrarily. |
||||
|
||||
To update the amount of free space on a page to X, first update the leaf node |
||||
corresponding to the heap page, then "bubble up" the change to upper nodes, |
||||
by walking up to each parent and recomputing its value as the max of its |
||||
two children. Repeat until reaching the root or a parent whose value |
||||
doesn't change. |
||||
|
||||
This data structure has a couple of nice properties: |
||||
- to discover that there is no page with X bytes of free space, you only |
||||
need to look at the root node |
||||
- by varying which child to traverse to in the search algorithm, when you have |
||||
a choice, we can implement various strategies, like preferring pages closer |
||||
to a given page, or spreading the load across the table. |
||||
|
||||
Higher-level routines that use FSM pages access them through the fsm_set_avail() |
||||
and fsm_search_avail() functions. The interface to those functions hides the |
||||
page's internal tree structure, treating the FSM page as a black box that has |
||||
a certain number of "slots" for storing free space information. (However, |
||||
the higher routines have to be aware of the tree structure of the whole map.) |
||||
|
||||
The binary tree is stored on each FSM page as an array. Because the page |
||||
header takes some space on a page, the binary tree isn't perfect. That is, |
||||
a few right-most leaf nodes are missing, and there are some useless non-leaf |
||||
nodes at the right. So the tree looks something like this: |
||||
|
||||
0 |
||||
1 2 |
||||
3 4 5 6 |
||||
7 8 9 A B |
||||
|
||||
where the numbers denote each node's position in the array. Note that the |
||||
tree is guaranteed complete above the leaf level; only some leaf nodes are |
||||
missing. This is reflected in the number of usable "slots" per page not |
||||
being an exact power of 2. |
||||
|
||||
A FSM page also has a next slot pointer, fp_next_slot, that determines where |
||||
to start the next search for free space within that page. The reason for that |
||||
is to spread out the pages that are returned by FSM searches. When several |
||||
backends are concurrently inserting into a relation, contention can be avoided |
||||
by having them insert into different pages. But it is also desirable to fill |
||||
up pages in sequential order, to get the benefit of OS prefetching and batched |
||||
writes. The FSM is responsible for making that happen, and the next slot |
||||
pointer helps provide the desired behavior. |
||||
|
||||
Higher-level structure |
||||
---------------------- |
||||
|
||||
To scale up the data structure described above beyond a single page, we |
||||
maintain a similar tree-structure across pages. Leaf nodes in higher level |
||||
pages correspond to lower level FSM pages. The root node within each page |
||||
has the same value as the corresponding leaf node on its parent page. |
||||
|
||||
The root page is always stored at physical block 0. |
||||
|
||||
For example, assuming each FSM page can hold information about 4 pages (in |
||||
reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ), |
||||
we get a disk layout like this: |
||||
|
||||
0 <-- page 0 at level 2 (root page) |
||||
0 <-- page 0 at level 1 |
||||
0 <-- page 0 at level 0 |
||||
1 <-- page 1 at level 0 |
||||
2 <-- ... |
||||
3 |
||||
1 <-- page 1 at level 1 |
||||
4 |
||||
5 |
||||
6 |
||||
7 |
||||
2 |
||||
8 |
||||
9 |
||||
10 |
||||
11 |
||||
3 |
||||
12 |
||||
13 |
||||
14 |
||||
15 |
||||
|
||||
where the numbers are page numbers *at that level*, starting from 0. |
||||
|
||||
To find the physical block # corresponding to leaf page n, we need to |
||||
count the number of leaf and upper-level pages preceding page n. |
||||
This turns out to be |
||||
|
||||
y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1 |
||||
|
||||
where F is the fanout (4 in the above example). The first term n is the number |
||||
of preceding leaf pages, the second term is the number of pages at level 1, |
||||
and so forth. |
||||
|
||||
To keep things simple, the tree is always constant height. To cover the |
||||
maximum relation size of 2^32-1 blocks, three levels are enough with the default |
||||
BLCKSZ (4000^3 > 2^32). |
||||
|
||||
Addressing |
||||
---------- |
||||
|
||||
The higher-level routines operate on "logical" addresses, consisting of |
||||
- level, |
||||
- logical page number, and |
||||
- slot (if applicable) |
||||
|
||||
Bottom level FSM pages have level of 0, the level above that 1, and root 2. |
||||
As in the diagram above, logical page number is the page number at that level, |
||||
starting from 0. |
||||
|
||||
Locking |
||||
------- |
||||
|
||||
When traversing down to search for free space, only one page is locked at a |
||||
time: the parent page is released before locking the child. If the child page |
||||
is concurrently modified, and there no longer is free space on the child page |
||||
when you land on it, you need to start from scratch (after correcting the |
||||
parent page, so that you don't get into an infinite loop). |
||||
|
||||
We use shared buffer locks when searching, but exclusive buffer lock when |
||||
updating a page. However, the next slot search pointer is updated during |
||||
searches even though we have only a shared lock. fp_next_slot is just a hint |
||||
and we can easily reset it if it gets corrupted; so it seems better to accept |
||||
some risk of that type than to pay the overhead of exclusive locking. |
||||
|
||||
Recovery |
||||
-------- |
||||
|
||||
The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of |
||||
self-correcting measures to repair possible corruption. |
||||
|
||||
First of all, whenever a value is set on an FSM page, the root node of the |
||||
page is compared against the new value after bubbling up the change is |
||||
finished. It should be greater than or equal to the value just set, or we |
||||
have a corrupted page, with a parent somewhere with too small a value. |
||||
Secondly, we check for corrupted pages while we search, traversing down |
||||
the tree. That check will notice if a parent node is set to too high a value. |
||||
In both cases, the upper nodes on the page are immediately rebuilt, fixing |
||||
the corruption. |
||||
|
||||
Vacuum updates all the bottom level pages with correct amount of free space |
||||
on the heap pages, fixing any outdated values there. After the heap and |
||||
index passes are done, FreeSpaceMapVacuum is called, and the FSM tree is |
||||
scanned in depth-first order. This fixes any discrepancies between upper |
||||
and lower level FSM pages. |
||||
|
||||
TODO |
||||
---- |
||||
|
||||
- fastroot to avoid traversing upper nodes with just 1 child |
||||
- use a different system for tables that fit into one FSM page, with a |
||||
mechanism to switch to the real thing as it grows. |
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,352 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* fsmpage.c |
||||
* routines to search and manipulate one FSM page. |
||||
* |
||||
* |
||||
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* $PostgreSQL: pgsql/src/backend/storage/freespace/fsmpage.c,v 1.1 2008/09/30 10:52:13 heikki Exp $ |
||||
* |
||||
* NOTES: |
||||
* |
||||
* The public functions in this file form an API that hides the internal |
||||
* structure of a FSM page. This allows freespace.c to treat each FSM page |
||||
* as a black box with SlotsPerFSMPage "slots". fsm_set_avail() and |
||||
* fsm_get_avail() let you set/get the value of a slot, and |
||||
* fsm_search_avail() lets you search for a slot with value >= X. |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "storage/bufmgr.h" |
||||
#include "storage/fsm_internals.h" |
||||
|
||||
/* Index arithmetic for the array-encoded binary tree within a page. */
#define leftchild(x)	(2 * (x) + 1)
#define rightchild(x)	(2 * (x) + 2)
#define parentof(x)		(((x) - 1) / 2)

/*
 * Returns the node to the right of x on the same tree level; the rightmost
 * node of a level wraps around to that level's leftmost node.
 */
static int
rightsibling(int x)
{
	const int	next = x + 1;

	/*
	 * The leftmost node of each level sits at index 2^level - 1, so "next"
	 * has spilled over into the following level exactly when (next + 1) is a
	 * power of two.  If it has not, it is the sibling we want.
	 */
	if (((next + 1) & next) != 0)
		return next;

	/*
	 * We stepped onto the leftmost node of the level below; its parent is
	 * the leftmost node of the level we started on.
	 */
	return parentof(next);
}
||||
|
||||
/*
|
||||
* Sets the value of a slot on page. Returns true if the page was |
||||
* modified. |
||||
* |
||||
* The caller must hold an exclusive lock on the page. |
||||
*/ |
||||
bool |
||||
fsm_set_avail(Page page, int slot, uint8 value) |
||||
{ |
||||
int nodeno = NonLeafNodesPerPage + slot; |
||||
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||
uint8 oldvalue; |
||||
|
||||
Assert(slot < LeafNodesPerPage); |
||||
|
||||
oldvalue = fsmpage->fp_nodes[nodeno]; |
||||
|
||||
/* If the value hasn't changed, we don't need to do anything */ |
||||
if (oldvalue == value && value <= fsmpage->fp_nodes[0]) |
||||
return false; |
||||
|
||||
fsmpage->fp_nodes[nodeno] = value; |
||||
|
||||
/*
|
||||
* Propagate up, until we hit the root or a node that doesn't |
||||
* need to be updated. |
||||
*/ |
||||
do |
||||
{ |
||||
uint8 newvalue = 0; |
||||
int lchild; |
||||
int rchild; |
||||
|
||||
nodeno = parentof(nodeno); |
||||
lchild = leftchild(nodeno); |
||||
rchild = lchild + 1; |
||||
|
||||
newvalue = fsmpage->fp_nodes[lchild]; |
||||
if (rchild < NodesPerPage) |
||||
newvalue = Max(newvalue, |
||||
fsmpage->fp_nodes[rchild]); |
||||
|
||||
oldvalue = fsmpage->fp_nodes[nodeno]; |
||||
if (oldvalue == newvalue) |
||||
break; |
||||
|
||||
fsmpage->fp_nodes[nodeno] = newvalue; |
||||
} while (nodeno > 0); |
||||
|
||||
/*
|
||||
* sanity check: if the new value value is higher than the value |
||||
* at the top, the tree is corrupt. |
||||
*/ |
||||
if (value > fsmpage->fp_nodes[0]) |
||||
fsm_rebuild_page(page); |
||||
|
||||
return true; |
||||
} |
||||
|
||||
/*
|
||||
* Returns the value of given slot on page. |
||||
* |
||||
* Since this is just a read-only access of a single byte, the page doesn't |
||||
* need to be locked. |
||||
*/ |
||||
uint8 |
||||
fsm_get_avail(Page page, int slot) |
||||
{ |
||||
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||
|
||||
return fsmpage->fp_nodes[NonLeafNodesPerPage + slot]; |
||||
} |
||||
|
||||
/*
|
||||
* Returns the value at the root of a page. |
||||
* Since this is just a read-only access of a single byte, the page doesn't |
||||
* need to be locked. |
||||
*/ |
||||
uint8 |
||||
fsm_get_max_avail(Page page) |
||||
{ |
||||
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||
return fsmpage->fp_nodes[0]; |
||||
} |
||||
|
||||
/*
|
||||
* Searches for a slot with min. category. Returns slot number, or -1 if
|
||||
* none found. |
||||
* |
||||
* The caller must hold at least a shared lock on the page, and this |
||||
* function can unlock and lock the page again in exclusive mode if it |
||||
* needs to be updated. exclusive_lock_held should be set to true if the |
||||
* caller is already holding an exclusive lock, to avoid extra work. |
||||
* |
||||
* If advancenext is false, fp_next_slot is set to point to the returned |
||||
* slot, and if it's true, to the slot next to the returned slot. |
||||
*/ |
||||
int |
||||
fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, |
||||
bool exclusive_lock_held) |
||||
{ |
||||
Page page = BufferGetPage(buf); |
||||
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||
int nodeno; |
||||
int target; |
||||
uint16 slot; |
||||
|
||||
restart: |
||||
/*
|
||||
* Check the root first, and exit quickly if there's no page with |
||||
* enough free space |
||||
*/ |
||||
if (fsmpage->fp_nodes[0] < minvalue) |
||||
return -1; |
||||
|
||||
|
||||
/* fp_next_slot is just a hint, so check that it's sane */ |
||||
target = fsmpage->fp_next_slot; |
||||
if (target < 0 || target >= LeafNodesPerPage) |
||||
target = 0; |
||||
target += NonLeafNodesPerPage; |
||||
|
||||
/*
|
||||
* Start the search from the target slot. At every step, move one |
||||
* node to the right, and climb up to the parent. Stop when we reach a |
||||
* node with enough free space. (note that moving to the right only |
||||
* makes a difference if we're on the right child of the parent) |
||||
* |
||||
* The idea is to graduall expand our "search triangle", that is, all |
||||
* nodes covered by the current node. In the beginning, just the target |
||||
* node is included, and more nodes to the right of the target node, |
||||
* taking wrap-around into account, is included at each step. Nodes are |
||||
* added to the search triangle in left-to-right order, starting from |
||||
* the target node. This ensures that we'll find the first suitable node |
||||
* to the right of the target node, and not some other node with enough |
||||
* free space. |
||||
* |
||||
* For example, consider this tree: |
||||
* |
||||
* 7 |
||||
* 7 6 |
||||
* 5 7 6 5 |
||||
* 4 5 5 7 2 6 5 2 |
||||
* T |
||||
* |
||||
* Imagine that target node is the node indicated by the letter T, and |
||||
* we're searching for a node with value of 6 or higher. The search |
||||
* begins at T. At first iteration, we move to the right, and to the |
||||
* parent, arriving the rightmost 5. At the 2nd iteration, we move to the |
||||
* right, wrapping around, and climb up, arriving at the 7 at the 2nd |
||||
* level. 7 satisfies our search, so we descend down to the bottom, |
||||
* following the path of sevens. |
||||
*/ |
||||
nodeno = target; |
||||
while (nodeno > 0) |
||||
{ |
||||
if (fsmpage->fp_nodes[nodeno] >= minvalue) |
||||
break; |
||||
|
||||
/*
|
||||
* Move to the right, wrapping around at the level if necessary, and |
||||
* climb up. |
||||
*/ |
||||
nodeno = parentof(rightsibling(nodeno)); |
||||
} |
||||
|
||||
/*
|
||||
* We're now at a node with enough free space, somewhere in the middle of |
||||
* the tree. Descend to the bottom, following a path with enough free |
||||
* space, preferring to move left if there's a choice. |
||||
*/ |
||||
while (nodeno < NonLeafNodesPerPage) |
||||
{ |
||||
int leftnodeno = leftchild(nodeno); |
||||
int rightnodeno = leftnodeno + 1; |
||||
bool leftok = (leftnodeno < NodesPerPage) && |
||||
(fsmpage->fp_nodes[leftnodeno] >= minvalue); |
||||
bool rightok = (rightnodeno < NodesPerPage) && |
||||
(fsmpage->fp_nodes[rightnodeno] >= minvalue); |
||||
|
||||
if (leftok) |
||||
nodeno = leftnodeno; |
||||
else if (rightok) |
||||
nodeno = rightnodeno; |
||||
else |
||||
{ |
||||
/*
|
||||
* Oops. The parent node promised that either left or right |
||||
* child has enough space, but neither actually did. This can |
||||
* happen in case of a "torn page", IOW if we crashed earlier |
||||
* while writing the page to disk, and only part of the page |
||||
* made it to disk. |
||||
* |
||||
* Fix the corruption and restart. |
||||
*/ |
||||
RelFileNode rnode; |
||||
ForkNumber forknum; |
||||
BlockNumber blknum; |
||||
|
||||
BufferGetTag(buf, &rnode, &forknum, &blknum); |
||||
elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", |
||||
blknum, rnode.spcNode, rnode.dbNode, rnode.relNode); |
||||
|
||||
/* make sure we hold an exclusive lock */ |
||||
if (!exclusive_lock_held) |
||||
{ |
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
||||
exclusive_lock_held = true; |
||||
} |
||||
fsm_rebuild_page(page); |
||||
MarkBufferDirty(buf); |
||||
goto restart; |
||||
} |
||||
} |
||||
|
||||
/* We're now at the bottom level, at a node with enough space. */ |
||||
slot = nodeno - NonLeafNodesPerPage; |
||||
|
||||
/*
|
||||
* Update the next slot pointer. Note that we do this even if we're only |
||||
* holding a shared lock, on the grounds that it's better to use a shared |
||||
* lock and get a garbled next pointer every now and then, than take the |
||||
* concurrency hit of an exlusive lock. |
||||
* |
||||
* Wrap-around is handled at the beginning of this function. |
||||
*/ |
||||
fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); |
||||
|
||||
return slot; |
||||
} |
||||
|
||||
/*
|
||||
* Sets the available space to zero for all slots numbered >= nslots. |
||||
* Returns true if the page was modified. |
||||
*/ |
||||
bool |
||||
fsm_truncate_avail(Page page, int nslots) |
||||
{ |
||||
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||
uint8 *ptr; |
||||
bool changed = false; |
||||
|
||||
Assert(nslots >= 0 && nslots < LeafNodesPerPage); |
||||
|
||||
/* Clear all truncated leaf nodes */ |
||||
ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots]; |
||||
for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++) |
||||
{ |
||||
if (*ptr != 0) |
||||
changed = true; |
||||
*ptr = 0; |
||||
} |
||||
|
||||
/* Fix upper nodes. */ |
||||
if (changed) |
||||
fsm_rebuild_page(page); |
||||
|
||||
return changed; |
||||
} |
||||
|
||||
/*
|
||||
* Reconstructs the upper levels of a page. Returns true if the page |
||||
* was modified. |
||||
*/ |
||||
bool |
||||
fsm_rebuild_page(Page page) |
||||
{ |
||||
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||
bool changed = false; |
||||
int nodeno; |
||||
|
||||
/*
|
||||
* Start from the lowest non-leaflevel, at last node, working our way |
||||
* backwards, through all non-leaf nodes at all levels, up to the root. |
||||
*/ |
||||
for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--) |
||||
{ |
||||
int lchild = leftchild(nodeno); |
||||
int rchild = lchild + 1; |
||||
uint8 newvalue = 0; |
||||
|
||||
if (lchild < NodesPerPage) |
||||
newvalue = fsmpage->fp_nodes[lchild]; |
||||
|
||||
if (rchild < NodesPerPage) |
||||
newvalue = Max(newvalue, |
||||
fsmpage->fp_nodes[rchild]); |
||||
|
||||
if (fsmpage->fp_nodes[nodeno] != newvalue) |
||||
{ |
||||
fsmpage->fp_nodes[nodeno] = newvalue; |
||||
changed = true; |
||||
} |
||||
} |
||||
|
||||
return changed; |
||||
} |
||||
|
@ -0,0 +1,92 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* indexfsm.c |
||||
* POSTGRES free space map for quickly finding free pages in relations |
||||
* |
||||
* |
||||
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* IDENTIFICATION |
||||
* $PostgreSQL: pgsql/src/backend/storage/freespace/indexfsm.c,v 1.1 2008/09/30 10:52:13 heikki Exp $ |
||||
* |
||||
* |
||||
* NOTES: |
||||
* |
||||
* This is similar to the FSM used for heap, in freespace.c, but instead |
||||
* of tracking the amount of free space on pages, we only track whether |
||||
* pages are completely free or in-use. We use the same FSM implementation |
||||
* as for heaps, using BLCKSZ - 1 to denote unused pages, and 0 for used. |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "storage/freespace.h" |
||||
#include "storage/indexfsm.h" |
||||
#include "storage/smgr.h" |
||||
|
||||
/*
|
||||
* Exported routines |
||||
*/ |
||||
|
||||
/*
|
||||
* InitIndexFreeSpaceMap - Create or reset the FSM fork for relation. |
||||
*/ |
||||
void |
||||
InitIndexFreeSpaceMap(Relation rel) |
||||
{ |
||||
/* Create FSM fork if it doesn't exist yet, or truncate it if it does */ |
||||
RelationOpenSmgr(rel); |
||||
if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) |
||||
smgrcreate(rel->rd_smgr, FSM_FORKNUM, rel->rd_istemp, false); |
||||
else |
||||
smgrtruncate(rel->rd_smgr, FSM_FORKNUM, 0, rel->rd_istemp); |
||||
} |
||||
|
||||
/*
|
||||
* GetFreeIndexPage - return a free page from the FSM |
||||
* |
||||
* As a side effect, the page is marked as used in the FSM. |
||||
*/ |
||||
BlockNumber |
||||
GetFreeIndexPage(Relation rel) |
||||
{ |
||||
BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ/2); |
||||
|
||||
if (blkno != InvalidBlockNumber) |
||||
RecordUsedIndexPage(rel, blkno); |
||||
|
||||
return blkno; |
||||
} |
||||
|
||||
/*
|
||||
* RecordFreeIndexPage - mark a page as free in the FSM |
||||
*/ |
||||
void |
||||
RecordFreeIndexPage(Relation rel, BlockNumber freeBlock) |
||||
{ |
||||
RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1); |
||||
} |
||||
|
||||
|
||||
/*
|
||||
* RecordUsedIndexPage - mark a page as used in the FSM |
||||
*/ |
||||
void |
||||
RecordUsedIndexPage(Relation rel, BlockNumber usedBlock) |
||||
{ |
||||
RecordPageWithFreeSpace(rel, usedBlock, 0); |
||||
} |
||||
|
||||
/*
|
||||
* IndexFreeSpaceMapTruncate - adjust for truncation of a relation. |
||||
* |
||||
* We need to delete any stored data past the new relation length, so that |
||||
* we don't bogusly return removed block numbers. |
||||
*/ |
||||
void |
||||
IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks) |
||||
{ |
||||
FreeSpaceMapTruncateRel(rel, nblocks); |
||||
} |
@ -0,0 +1,73 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* fsm_internals.h |
||||
* internal functions for free space map |
||||
* |
||||
* |
||||
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* $PostgreSQL: pgsql/src/include/storage/fsm_internals.h,v 1.1 2008/09/30 10:52:14 heikki Exp $ |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#ifndef FSM_INTERNALS_H |
||||
#define FSM_INTERNALS_H |
||||
|
||||
#include "storage/buf.h" |
||||
#include "storage/bufpage.h" |
||||
#include "lib/stringinfo.h" |
||||
|
||||
/*
|
||||
* Structure of a FSM page. See src/backend/storage/freespace/README for |
||||
* details. |
||||
*/ |
||||
typedef struct |
||||
{ |
||||
/*
|
||||
* fsm_search_avail() tries to spread the load of multiple backends |
||||
* by returning different pages to different backends in a round-robin |
||||
* fashion. fp_next_slot points to the next slot to be returned |
||||
* (assuming there's enough space on it for the request). It's defined |
||||
* as an int, because it's updated without an exclusive lock. uint16 |
||||
* would be more appropriate, but int is more likely to be atomically |
||||
* fetchable/storable. |
||||
*/ |
||||
int fp_next_slot; |
||||
|
||||
/*
|
||||
* fp_nodes contains the binary tree, stored in array. The first |
||||
* NonLeafNodesPerPage elements are upper nodes, and the following |
||||
* LeafNodesPerPage elements are leaf nodes. Unused nodes are zero. |
||||
*/ |
||||
uint8 fp_nodes[1]; |
||||
} FSMPageData; |
||||
|
||||
typedef FSMPageData *FSMPage; |
||||
|
||||
/*
|
||||
* Number of non-leaf and leaf nodes, and nodes in total, on an FSM page. |
||||
* These definitions are internal to fsmpage.c. |
||||
*/ |
||||
#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \ |
||||
offsetof(FSMPageData, fp_nodes)) |
||||
|
||||
#define NonLeafNodesPerPage (BLCKSZ / 2 - 1) |
||||
#define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage) |
||||
|
||||
/*
|
||||
* Number of FSM "slots" on a FSM page. This is what should be used |
||||
* outside fsmpage.c. |
||||
*/ |
||||
#define SlotsPerFSMPage LeafNodesPerPage |
||||
|
||||
/* Prototypes for functions in fsmpage.c */ |
||||
extern int fsm_search_avail(Buffer buf, uint8 min_cat, bool advancenext, |
||||
bool exclusive_lock_held); |
||||
extern uint8 fsm_get_avail(Page page, int slot); |
||||
extern uint8 fsm_get_max_avail(Page page); |
||||
extern bool fsm_set_avail(Page page, int slot, uint8 value); |
||||
extern bool fsm_truncate_avail(Page page, int nslots); |
||||
extern bool fsm_rebuild_page(Page page); |
||||
|
||||
#endif /* FSM_INTERNALS_H */ |
@ -0,0 +1,27 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* indexfsm.h |
||||
* POSTGRES free space map for quickly finding an unused page in index |
||||
* |
||||
* |
||||
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* $PostgreSQL: pgsql/src/include/storage/indexfsm.h,v 1.1 2008/09/30 10:52:14 heikki Exp $ |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
#ifndef INDEXFSM_H_ |
||||
#define INDEXFSM_H_ |
||||
|
||||
#include "utils/rel.h" |
||||
|
||||
extern void InitIndexFreeSpaceMap(Relation rel); |
||||
|
||||
extern BlockNumber GetFreeIndexPage(Relation rel); |
||||
extern void RecordFreeIndexPage(Relation rel, BlockNumber page); |
||||
extern void RecordUsedIndexPage(Relation rel, BlockNumber page); |
||||
|
||||
extern void IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks); |
||||
|
||||
#endif /* INDEXFSM_H_ */
Loading…
Reference in new issue