mirror of https://github.com/postgres/postgres
free space information is stored in a dedicated FSM relation fork, with each relation (except for hash indexes; they don't use FSM). This eliminates the max_fsm_relations and max_fsm_pages GUC options; remove any trace of them from the backend, initdb, and documentation. Rewrite contrib/pg_freespacemap to match the new FSM implementation. Also introduce a new variant of the get_raw_page(regclass, int4, int4) function in contrib/pageinspect that lets you return pages from any relation fork, and a new fsm_page_contents() function to inspect the new FSM pages.REL8_5_ALPHA1_BRANCH
parent
2dbc0ca937
commit
15c121b3ed
@ -0,0 +1,61 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* |
||||||
|
* fsmfuncs.c |
||||||
|
* Functions to investigate FSM pages |
||||||
|
* |
||||||
|
* These functions are restricted to superusers for the fear of introducing |
||||||
|
* security holes if the input checking isn't as water-tight as it should. |
||||||
|
* You'd need to be superuser to obtain a raw page image anyway, so |
||||||
|
* there's hardly any use case for using these without superuser-rights |
||||||
|
* anyway. |
||||||
|
* |
||||||
|
* Copyright (c) 2007-2008, PostgreSQL Global Development Group |
||||||
|
* |
||||||
|
* IDENTIFICATION |
||||||
|
* $PostgreSQL: pgsql/contrib/pageinspect/fsmfuncs.c,v 1.1 2008/09/30 10:52:09 heikki Exp $ |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "postgres.h" |
||||||
|
#include "lib/stringinfo.h" |
||||||
|
#include "storage/fsm_internals.h" |
||||||
|
#include "utils/builtins.h" |
||||||
|
#include "miscadmin.h" |
||||||
|
#include "funcapi.h" |
||||||
|
|
||||||
|
Datum fsm_page_contents(PG_FUNCTION_ARGS); |
||||||
|
|
||||||
|
/*
|
||||||
|
* Dumps the contents of a FSM page. |
||||||
|
*/ |
||||||
|
PG_FUNCTION_INFO_V1(fsm_page_contents); |
||||||
|
|
||||||
|
Datum |
||||||
|
fsm_page_contents(PG_FUNCTION_ARGS) |
||||||
|
{ |
||||||
|
bytea *raw_page = PG_GETARG_BYTEA_P(0); |
||||||
|
int raw_page_size; |
||||||
|
StringInfoData sinfo; |
||||||
|
FSMPage fsmpage; |
||||||
|
int i; |
||||||
|
|
||||||
|
if (!superuser()) |
||||||
|
ereport(ERROR, |
||||||
|
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), |
||||||
|
(errmsg("must be superuser to use raw page functions")))); |
||||||
|
|
||||||
|
raw_page_size = VARSIZE(raw_page) - VARHDRSZ; |
||||||
|
fsmpage = (FSMPage) PageGetContents(VARDATA(raw_page)); |
||||||
|
|
||||||
|
initStringInfo(&sinfo); |
||||||
|
|
||||||
|
for(i=0; i < NodesPerPage; i++) |
||||||
|
{ |
||||||
|
if (fsmpage->fp_nodes[i] != 0) |
||||||
|
appendStringInfo(&sinfo, "%d: %d\n", i, fsmpage->fp_nodes[i]); |
||||||
|
} |
||||||
|
appendStringInfo(&sinfo, "fp_next_slot: %d\n", fsmpage->fp_next_slot); |
||||||
|
|
||||||
|
PG_RETURN_TEXT_P(cstring_to_text(sinfo.data)); |
||||||
|
} |
@ -1,44 +1,26 @@ |
|||||||
/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.8 2007/11/13 04:24:28 momjian Exp $ */ |
/* $PostgreSQL: pgsql/contrib/pg_freespacemap/pg_freespacemap.sql.in,v 1.9 2008/09/30 10:52:09 heikki Exp $ */ |
||||||
|
|
||||||
-- Adjust this setting to control where the objects get created. |
-- Adjust this setting to control where the objects get created. |
||||||
SET search_path = public; |
SET search_path = public; |
||||||
|
|
||||||
|
|
||||||
-- Register the functions. |
-- Register the C function. |
||||||
CREATE OR REPLACE FUNCTION pg_freespacemap_pages() |
CREATE OR REPLACE FUNCTION pg_freespace(regclass, int4) |
||||||
RETURNS SETOF RECORD |
RETURNS int2 |
||||||
AS 'MODULE_PATHNAME', 'pg_freespacemap_pages' |
AS 'MODULE_PATHNAME', 'pg_freespace' |
||||||
LANGUAGE C; |
LANGUAGE C; |
||||||
|
|
||||||
CREATE OR REPLACE FUNCTION pg_freespacemap_relations() |
-- pg_freespace shows the recorded space avail at each block in a relation |
||||||
|
CREATE OR REPLACE FUNCTION |
||||||
|
pg_freespace(rel regclass, blkno OUT int4, avail OUT int2) |
||||||
RETURNS SETOF RECORD |
RETURNS SETOF RECORD |
||||||
AS 'MODULE_PATHNAME', 'pg_freespacemap_relations' |
AS $$ |
||||||
LANGUAGE C; |
SELECT blkno::int4, pg_freespace($1, blkno::int4) AS avail |
||||||
|
FROM generate_series(0, pg_relation_size($1) / current_setting('block_size')::bigint - 1) AS blkno; |
||||||
|
$$ |
||||||
-- Create views for convenient access. |
LANGUAGE SQL; |
||||||
CREATE VIEW pg_freespacemap_pages AS |
|
||||||
SELECT P.* FROM pg_freespacemap_pages() AS P |
|
||||||
(reltablespace oid, |
|
||||||
reldatabase oid, |
|
||||||
relfilenode oid, |
|
||||||
relblocknumber bigint, |
|
||||||
bytes integer); |
|
||||||
|
|
||||||
CREATE VIEW pg_freespacemap_relations AS |
|
||||||
SELECT P.* FROM pg_freespacemap_relations() AS P |
|
||||||
(reltablespace oid, |
|
||||||
reldatabase oid, |
|
||||||
relfilenode oid, |
|
||||||
avgrequest integer, |
|
||||||
interestingpages integer, |
|
||||||
storedpages integer, |
|
||||||
nextpage integer); |
|
||||||
|
|
||||||
|
|
||||||
-- Don't want these to be available to public. |
-- Don't want these to be available to public. |
||||||
REVOKE ALL ON FUNCTION pg_freespacemap_pages() FROM PUBLIC; |
REVOKE ALL ON FUNCTION pg_freespace(regclass, int4) FROM PUBLIC; |
||||||
REVOKE ALL ON pg_freespacemap_pages FROM PUBLIC; |
REVOKE ALL ON FUNCTION pg_freespace(regclass) FROM PUBLIC; |
||||||
|
|
||||||
REVOKE ALL ON FUNCTION pg_freespacemap_relations() FROM PUBLIC; |
|
||||||
REVOKE ALL ON pg_freespacemap_relations FROM PUBLIC; |
|
||||||
|
@ -0,0 +1,195 @@ |
|||||||
|
$PostgreSQL: pgsql/src/backend/storage/freespace/README,v 1.1 2008/09/30 10:52:13 heikki Exp $ |
||||||
|
|
||||||
|
Free Space Map |
||||||
|
-------------- |
||||||
|
|
||||||
|
The purpose of the free space map is to quickly locate a page with enough |
||||||
|
free space to hold a tuple to be stored; or to determine that no such page |
||||||
|
exists and the relation must be extended by one page. As of PostgreSQL 8.4 |
||||||
|
each relation has its own, extensible free space map stored in a separate |
||||||
|
"fork" of its relation. This eliminates the disadvantages of the former |
||||||
|
fixed-size FSM. |
||||||
|
|
||||||
|
It is important to keep the map small so that it can be searched rapidly. |
||||||
|
Therefore, we don't attempt to record the exact free space on a page. |
||||||
|
We allocate one map byte to each page, allowing us to record free space |
||||||
|
at a granularity of 1/256th of a page. Another way to say it is that |
||||||
|
the stored value is the free space divided by BLCKSZ/256 (rounding down). |
||||||
|
We assume that the free space must always be less than BLCKSZ, since |
||||||
|
all pages have some overhead; so the maximum map value is 255. |
||||||
|
|
||||||
|
To assist in fast searching, the map isn't simply an array of per-page |
||||||
|
entries, but has a tree structure above those entries. There is a tree |
||||||
|
structure of pages, and a tree structure within each page, as described |
||||||
|
below. |
||||||
|
|
||||||
|
FSM page structure |
||||||
|
------------------ |
||||||
|
|
||||||
|
Within each FSM page, we use a binary tree structure where leaf nodes store |
||||||
|
the amount of free space on heap pages (or lower level FSM pages, see |
||||||
|
"Higher-level structure" below), with one leaf node per heap page. A non-leaf |
||||||
|
node stores the max amount of free space on any of its children. |
||||||
|
|
||||||
|
For example: |
||||||
|
|
||||||
|
    4
 4     2
3 4   0 2    <- This level represents heap pages
||||||
|
|
||||||
|
We need two basic operations: search and update. |
||||||
|
|
||||||
|
To search for a page with X amount of free space, traverse down the tree |
||||||
|
along a path where n >= X, until you hit the bottom. If both children of a |
||||||
|
node satisfy the condition, you can pick either one arbitrarily. |
||||||
|
|
||||||
|
To update the amount of free space on a page to X, first update the leaf node |
||||||
|
corresponding to the heap page, then "bubble up" the change to upper nodes, |
||||||
|
by walking up to each parent and recomputing its value as the max of its |
||||||
|
two children. Repeat until reaching the root or a parent whose value |
||||||
|
doesn't change. |
||||||
|
|
||||||
|
This data structure has a couple of nice properties: |
||||||
|
- to discover that there is no page with X bytes of free space, you only |
||||||
|
need to look at the root node |
||||||
|
- by varying which child to traverse to in the search algorithm, when you have |
||||||
|
a choice, we can implement various strategies, like preferring pages closer |
||||||
|
to a given page, or spreading the load across the table. |
||||||
|
|
||||||
|
Higher-level routines that use FSM pages access them through the fsm_set_avail() |
||||||
|
and fsm_search_avail() functions. The interface to those functions hides the |
||||||
|
page's internal tree structure, treating the FSM page as a black box that has |
||||||
|
a certain number of "slots" for storing free space information. (However, |
||||||
|
the higher routines have to be aware of the tree structure of the whole map.) |
||||||
|
|
||||||
|
The binary tree is stored on each FSM page as an array. Because the page |
||||||
|
header takes some space on a page, the binary tree isn't perfect. That is, |
||||||
|
a few right-most leaf nodes are missing, and there are some useless non-leaf |
||||||
|
nodes at the right. So the tree looks something like this: |
||||||
|
|
||||||
|
       0
   1       2
 3   4   5   6
7 8 9 A B
||||||
|
|
||||||
|
where the numbers denote each node's position in the array. Note that the |
||||||
|
tree is guaranteed complete above the leaf level; only some leaf nodes are |
||||||
|
missing. This is reflected in the number of usable "slots" per page not |
||||||
|
being an exact power of 2. |
||||||
|
|
||||||
|
A FSM page also has a next slot pointer, fp_next_slot, that determines where |
||||||
|
to start the next search for free space within that page. The reason for that |
||||||
|
is to spread out the pages that are returned by FSM searches. When several |
||||||
|
backends are concurrently inserting into a relation, contention can be avoided |
||||||
|
by having them insert into different pages. But it is also desirable to fill |
||||||
|
up pages in sequential order, to get the benefit of OS prefetching and batched |
||||||
|
writes. The FSM is responsible for making that happen, and the next slot |
||||||
|
pointer helps provide the desired behavior. |
||||||
|
|
||||||
|
Higher-level structure |
||||||
|
---------------------- |
||||||
|
|
||||||
|
To scale up the data structure described above beyond a single page, we |
||||||
|
maintain a similar tree-structure across pages. Leaf nodes in higher level |
||||||
|
pages correspond to lower level FSM pages. The root node within each page |
||||||
|
has the same value as the corresponding leaf node on its parent page. |
||||||
|
|
||||||
|
The root page is always stored at physical block 0. |
||||||
|
|
||||||
|
For example, assuming each FSM page can hold information about 4 pages (in |
||||||
|
reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ), |
||||||
|
we get a disk layout like this: |
||||||
|
|
||||||
|
 0     <-- page 0 at level 2 (root page)
  0     <-- page 0 at level 1
   0     <-- page 0 at level 0
   1     <-- page 1 at level 0
   2     <-- ...
   3
  1     <-- page 1 at level 1
   4
   5
   6
   7
  2
   8
   9
   10
   11
  3
   12
   13
   14
   15
||||||
|
|
||||||
|
where the numbers are page numbers *at that level*, starting from 0. |
||||||
|
|
||||||
|
To find the physical block # corresponding to leaf page n, we need to |
||||||
|
count the number of leaf and upper-level pages preceding page n. |
||||||
|
This turns out to be |
||||||
|
|
||||||
|
y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1 |
||||||
|
|
||||||
|
where F is the fanout (4 in the above example). The first term n is the number |
||||||
|
of preceding leaf pages, the second term is the number of pages at level 1, |
||||||
|
and so forth. |
||||||
|
|
||||||
|
To keep things simple, the tree is always constant height. To cover the |
||||||
|
maximum relation size of 2^32-1 blocks, three levels is enough with the default |
||||||
|
BLCKSZ (4000^3 > 2^32). |
||||||
|
|
||||||
|
Addressing |
||||||
|
---------- |
||||||
|
|
||||||
|
The higher-level routines operate on "logical" addresses, consisting of |
||||||
|
- level, |
||||||
|
- logical page number, and |
||||||
|
- slot (if applicable) |
||||||
|
|
||||||
|
Bottom level FSM pages have level of 0, the level above that 1, and root 2. |
||||||
|
As in the diagram above, logical page number is the page number at that level, |
||||||
|
starting from 0. |
||||||
|
|
||||||
|
Locking |
||||||
|
------- |
||||||
|
|
||||||
|
When traversing down to search for free space, only one page is locked at a |
||||||
|
time: the parent page is released before locking the child. If the child page |
||||||
|
is concurrently modified, and there no longer is free space on the child page |
||||||
|
when you land on it, you need to start from scratch (after correcting the |
||||||
|
parent page, so that you don't get into an infinite loop). |
||||||
|
|
||||||
|
We use shared buffer locks when searching, but exclusive buffer lock when |
||||||
|
updating a page. However, the next slot search pointer is updated during |
||||||
|
searches even though we have only a shared lock. fp_next_slot is just a hint |
||||||
|
and we can easily reset it if it gets corrupted; so it seems better to accept |
||||||
|
some risk of that type than to pay the overhead of exclusive locking. |
||||||
|
|
||||||
|
Recovery |
||||||
|
-------- |
||||||
|
|
||||||
|
The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of |
||||||
|
self-correcting measures to repair possible corruption. |
||||||
|
|
||||||
|
First of all, whenever a value is set on an FSM page, the root node of the |
||||||
|
page is compared against the new value after bubbling up the change is |
||||||
|
finished. It should be greater than or equal to the value just set, or we |
||||||
|
have a corrupted page, with a parent somewhere with too small a value. |
||||||
|
Secondly, we check for corrupted pages while we search, traversing down |
||||||
|
the tree. That check will notice if a parent node is set to too high a value. |
||||||
|
In both cases, the upper nodes on the page are immediately rebuilt, fixing |
||||||
|
the corruption. |
||||||
|
|
||||||
|
Vacuum updates all the bottom level pages with correct amount of free space |
||||||
|
on the heap pages, fixing any outdated values there. After the heap and |
||||||
|
index passes are done, FreeSpaceMapVacuum is called, and the FSM tree is |
||||||
|
scanned in depth-first order. This fixes any discrepancies between upper |
||||||
|
and lower level FSM pages. |
||||||
|
|
||||||
|
TODO |
||||||
|
---- |
||||||
|
|
||||||
|
- fastroot to avoid traversing upper nodes with just 1 child |
||||||
|
- use a different system for tables that fit into one FSM page, with a |
||||||
|
mechanism to switch to the real thing as it grows. |
||||||
|
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,352 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* |
||||||
|
* fsmpage.c |
||||||
|
* routines to search and manipulate one FSM page. |
||||||
|
* |
||||||
|
* |
||||||
|
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California |
||||||
|
* |
||||||
|
* IDENTIFICATION |
||||||
|
* $PostgreSQL: pgsql/src/backend/storage/freespace/fsmpage.c,v 1.1 2008/09/30 10:52:13 heikki Exp $ |
||||||
|
* |
||||||
|
* NOTES: |
||||||
|
* |
||||||
|
* The public functions in this file form an API that hides the internal |
||||||
|
* structure of a FSM page. This allows freespace.c to treat each FSM page |
||||||
|
* as a black box with SlotsPerPage "slots". fsm_set_avail() and |
||||||
|
* fsm_get_avail() let you get/set the value of a slot, and |
||||||
|
* fsm_search_avail() lets you search for a slot with value >= X. |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
#include "postgres.h" |
||||||
|
|
||||||
|
#include "storage/bufmgr.h" |
||||||
|
#include "storage/fsm_internals.h" |
||||||
|
|
||||||
|
/*
 * Macros to navigate the binary tree stored as an array within a page:
 * the children of node x live at 2x+1 and 2x+2, and its parent at (x-1)/2.
 */
#define leftchild(x)	(2 * (x) + 1)
#define rightchild(x)	(leftchild(x) + 1)
#define parentof(x)		(((x) - 1) / 2)
||||||
|
|
||||||
|
/*
 * Returns the node to the right of x on the same tree level, wrapping
 * around to the leftmost node of that level when x is the rightmost one.
 */
static int
rightsibling(int x)
{
	int			next = x + 1;

	/*
	 * Stepping right from the last node of a level lands on the leftmost
	 * node of the next level down.  Leftmost nodes have the form
	 * 2^level - 1, so that case is recognizable by (next + 1) being a power
	 * of two; the parent of that node is the leftmost node of the level we
	 * started on.
	 */
	return ((next & (next + 1)) == 0) ? parentof(next) : next;
}
||||||
|
|
||||||
|
/*
|
||||||
|
* Sets the value of a slot on page. Returns true if the page was |
||||||
|
* modified. |
||||||
|
* |
||||||
|
* The caller must hold an exclusive lock on the page. |
||||||
|
*/ |
||||||
|
bool |
||||||
|
fsm_set_avail(Page page, int slot, uint8 value) |
||||||
|
{ |
||||||
|
int nodeno = NonLeafNodesPerPage + slot; |
||||||
|
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||||
|
uint8 oldvalue; |
||||||
|
|
||||||
|
Assert(slot < LeafNodesPerPage); |
||||||
|
|
||||||
|
oldvalue = fsmpage->fp_nodes[nodeno]; |
||||||
|
|
||||||
|
/* If the value hasn't changed, we don't need to do anything */ |
||||||
|
if (oldvalue == value && value <= fsmpage->fp_nodes[0]) |
||||||
|
return false; |
||||||
|
|
||||||
|
fsmpage->fp_nodes[nodeno] = value; |
||||||
|
|
||||||
|
/*
|
||||||
|
* Propagate up, until we hit the root or a node that doesn't |
||||||
|
* need to be updated. |
||||||
|
*/ |
||||||
|
do |
||||||
|
{ |
||||||
|
uint8 newvalue = 0; |
||||||
|
int lchild; |
||||||
|
int rchild; |
||||||
|
|
||||||
|
nodeno = parentof(nodeno); |
||||||
|
lchild = leftchild(nodeno); |
||||||
|
rchild = lchild + 1; |
||||||
|
|
||||||
|
newvalue = fsmpage->fp_nodes[lchild]; |
||||||
|
if (rchild < NodesPerPage) |
||||||
|
newvalue = Max(newvalue, |
||||||
|
fsmpage->fp_nodes[rchild]); |
||||||
|
|
||||||
|
oldvalue = fsmpage->fp_nodes[nodeno]; |
||||||
|
if (oldvalue == newvalue) |
||||||
|
break; |
||||||
|
|
||||||
|
fsmpage->fp_nodes[nodeno] = newvalue; |
||||||
|
} while (nodeno > 0); |
||||||
|
|
||||||
|
/*
|
||||||
|
* sanity check: if the new value value is higher than the value |
||||||
|
* at the top, the tree is corrupt. |
||||||
|
*/ |
||||||
|
if (value > fsmpage->fp_nodes[0]) |
||||||
|
fsm_rebuild_page(page); |
||||||
|
|
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the value of given slot on page. |
||||||
|
* |
||||||
|
* Since this is just a read-only access of a single byte, the page doesn't |
||||||
|
* need to be locked. |
||||||
|
*/ |
||||||
|
uint8 |
||||||
|
fsm_get_avail(Page page, int slot) |
||||||
|
{ |
||||||
|
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||||
|
|
||||||
|
return fsmpage->fp_nodes[NonLeafNodesPerPage + slot]; |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns the value at the root of a page. |
||||||
|
* Since this is just a read-only access of a single byte, the page doesn't |
||||||
|
* need to be locked. |
||||||
|
*/ |
||||||
|
uint8 |
||||||
|
fsm_get_max_avail(Page page) |
||||||
|
{ |
||||||
|
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||||
|
return fsmpage->fp_nodes[0]; |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* Searches for a slot with min. category. Returns slot number, or -1 if
|
||||||
|
* none found. |
||||||
|
* |
||||||
|
* The caller must hold at least a shared lock on the page, and this |
||||||
|
* function can unlock and lock the page again in exclusive mode if it |
||||||
|
* needs to be updated. exclusive_lock_held should be set to true if the |
||||||
|
* caller is already holding an exclusive lock, to avoid extra work. |
||||||
|
* |
||||||
|
* If advancenext is false, fp_next_slot is set to point to the returned |
||||||
|
* slot, and if it's true, to the slot next to the returned slot. |
||||||
|
*/ |
||||||
|
int |
||||||
|
fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, |
||||||
|
bool exclusive_lock_held) |
||||||
|
{ |
||||||
|
Page page = BufferGetPage(buf); |
||||||
|
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||||
|
int nodeno; |
||||||
|
int target; |
||||||
|
uint16 slot; |
||||||
|
|
||||||
|
restart: |
||||||
|
/*
|
||||||
|
* Check the root first, and exit quickly if there's no page with |
||||||
|
* enough free space |
||||||
|
*/ |
||||||
|
if (fsmpage->fp_nodes[0] < minvalue) |
||||||
|
return -1; |
||||||
|
|
||||||
|
|
||||||
|
/* fp_next_slot is just a hint, so check that it's sane */ |
||||||
|
target = fsmpage->fp_next_slot; |
||||||
|
if (target < 0 || target >= LeafNodesPerPage) |
||||||
|
target = 0; |
||||||
|
target += NonLeafNodesPerPage; |
||||||
|
|
||||||
|
/*
|
||||||
|
* Start the search from the target slot. At every step, move one |
||||||
|
* node to the right, and climb up to the parent. Stop when we reach a |
||||||
|
* node with enough free space. (note that moving to the right only |
||||||
|
* makes a difference if we're on the right child of the parent) |
||||||
|
* |
||||||
|
* The idea is to graduall expand our "search triangle", that is, all |
||||||
|
* nodes covered by the current node. In the beginning, just the target |
||||||
|
* node is included, and more nodes to the right of the target node, |
||||||
|
* taking wrap-around into account, is included at each step. Nodes are |
||||||
|
* added to the search triangle in left-to-right order, starting from |
||||||
|
* the target node. This ensures that we'll find the first suitable node |
||||||
|
* to the right of the target node, and not some other node with enough |
||||||
|
* free space. |
||||||
|
* |
||||||
|
* For example, consider this tree: |
||||||
|
* |
||||||
|
* 7 |
||||||
|
* 7 6 |
||||||
|
* 5 7 6 5 |
||||||
|
* 4 5 5 7 2 6 5 2 |
||||||
|
* T |
||||||
|
* |
||||||
|
* Imagine that target node is the node indicated by the letter T, and |
||||||
|
* we're searching for a node with value of 6 or higher. The search |
||||||
|
* begins at T. At first iteration, we move to the right, and to the |
||||||
|
* parent, arriving the rightmost 5. At the 2nd iteration, we move to the |
||||||
|
* right, wrapping around, and climb up, arriving at the 7 at the 2nd |
||||||
|
* level. 7 satisfies our search, so we descend down to the bottom, |
||||||
|
* following the path of sevens. |
||||||
|
*/ |
||||||
|
nodeno = target; |
||||||
|
while (nodeno > 0) |
||||||
|
{ |
||||||
|
if (fsmpage->fp_nodes[nodeno] >= minvalue) |
||||||
|
break; |
||||||
|
|
||||||
|
/*
|
||||||
|
* Move to the right, wrapping around at the level if necessary, and |
||||||
|
* climb up. |
||||||
|
*/ |
||||||
|
nodeno = parentof(rightsibling(nodeno)); |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* We're now at a node with enough free space, somewhere in the middle of |
||||||
|
* the tree. Descend to the bottom, following a path with enough free |
||||||
|
* space, preferring to move left if there's a choice. |
||||||
|
*/ |
||||||
|
while (nodeno < NonLeafNodesPerPage) |
||||||
|
{ |
||||||
|
int leftnodeno = leftchild(nodeno); |
||||||
|
int rightnodeno = leftnodeno + 1; |
||||||
|
bool leftok = (leftnodeno < NodesPerPage) && |
||||||
|
(fsmpage->fp_nodes[leftnodeno] >= minvalue); |
||||||
|
bool rightok = (rightnodeno < NodesPerPage) && |
||||||
|
(fsmpage->fp_nodes[rightnodeno] >= minvalue); |
||||||
|
|
||||||
|
if (leftok) |
||||||
|
nodeno = leftnodeno; |
||||||
|
else if (rightok) |
||||||
|
nodeno = rightnodeno; |
||||||
|
else |
||||||
|
{ |
||||||
|
/*
|
||||||
|
* Oops. The parent node promised that either left or right |
||||||
|
* child has enough space, but neither actually did. This can |
||||||
|
* happen in case of a "torn page", IOW if we crashed earlier |
||||||
|
* while writing the page to disk, and only part of the page |
||||||
|
* made it to disk. |
||||||
|
* |
||||||
|
* Fix the corruption and restart. |
||||||
|
*/ |
||||||
|
RelFileNode rnode; |
||||||
|
ForkNumber forknum; |
||||||
|
BlockNumber blknum; |
||||||
|
|
||||||
|
BufferGetTag(buf, &rnode, &forknum, &blknum); |
||||||
|
elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", |
||||||
|
blknum, rnode.spcNode, rnode.dbNode, rnode.relNode); |
||||||
|
|
||||||
|
/* make sure we hold an exclusive lock */ |
||||||
|
if (!exclusive_lock_held) |
||||||
|
{ |
||||||
|
LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
||||||
|
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
||||||
|
exclusive_lock_held = true; |
||||||
|
} |
||||||
|
fsm_rebuild_page(page); |
||||||
|
MarkBufferDirty(buf); |
||||||
|
goto restart; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/* We're now at the bottom level, at a node with enough space. */ |
||||||
|
slot = nodeno - NonLeafNodesPerPage; |
||||||
|
|
||||||
|
/*
|
||||||
|
* Update the next slot pointer. Note that we do this even if we're only |
||||||
|
* holding a shared lock, on the grounds that it's better to use a shared |
||||||
|
* lock and get a garbled next pointer every now and then, than take the |
||||||
|
* concurrency hit of an exlusive lock. |
||||||
|
* |
||||||
|
* Wrap-around is handled at the beginning of this function. |
||||||
|
*/ |
||||||
|
fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); |
||||||
|
|
||||||
|
return slot; |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* Sets the available space to zero for all slots numbered >= nslots. |
||||||
|
* Returns true if the page was modified. |
||||||
|
*/ |
||||||
|
bool |
||||||
|
fsm_truncate_avail(Page page, int nslots) |
||||||
|
{ |
||||||
|
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||||
|
uint8 *ptr; |
||||||
|
bool changed = false; |
||||||
|
|
||||||
|
Assert(nslots >= 0 && nslots < LeafNodesPerPage); |
||||||
|
|
||||||
|
/* Clear all truncated leaf nodes */ |
||||||
|
ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots]; |
||||||
|
for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++) |
||||||
|
{ |
||||||
|
if (*ptr != 0) |
||||||
|
changed = true; |
||||||
|
*ptr = 0; |
||||||
|
} |
||||||
|
|
||||||
|
/* Fix upper nodes. */ |
||||||
|
if (changed) |
||||||
|
fsm_rebuild_page(page); |
||||||
|
|
||||||
|
return changed; |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* Reconstructs the upper levels of a page. Returns true if the page |
||||||
|
* was modified. |
||||||
|
*/ |
||||||
|
bool |
||||||
|
fsm_rebuild_page(Page page) |
||||||
|
{ |
||||||
|
FSMPage fsmpage = (FSMPage) PageGetContents(page); |
||||||
|
bool changed = false; |
||||||
|
int nodeno; |
||||||
|
|
||||||
|
/*
|
||||||
|
* Start from the lowest non-leaflevel, at last node, working our way |
||||||
|
* backwards, through all non-leaf nodes at all levels, up to the root. |
||||||
|
*/ |
||||||
|
for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--) |
||||||
|
{ |
||||||
|
int lchild = leftchild(nodeno); |
||||||
|
int rchild = lchild + 1; |
||||||
|
uint8 newvalue = 0; |
||||||
|
|
||||||
|
if (lchild < NodesPerPage) |
||||||
|
newvalue = fsmpage->fp_nodes[lchild]; |
||||||
|
|
||||||
|
if (rchild < NodesPerPage) |
||||||
|
newvalue = Max(newvalue, |
||||||
|
fsmpage->fp_nodes[rchild]); |
||||||
|
|
||||||
|
if (fsmpage->fp_nodes[nodeno] != newvalue) |
||||||
|
{ |
||||||
|
fsmpage->fp_nodes[nodeno] = newvalue; |
||||||
|
changed = true; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
return changed; |
||||||
|
} |
||||||
|
|
@ -0,0 +1,92 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* |
||||||
|
* indexfsm.c |
||||||
|
* POSTGRES free space map for quickly finding free pages in relations |
||||||
|
* |
||||||
|
* |
||||||
|
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California |
||||||
|
* |
||||||
|
* IDENTIFICATION |
||||||
|
* $PostgreSQL: pgsql/src/backend/storage/freespace/indexfsm.c,v 1.1 2008/09/30 10:52:13 heikki Exp $ |
||||||
|
* |
||||||
|
* |
||||||
|
* NOTES: |
||||||
|
* |
||||||
|
* This is similar to the FSM used for heap, in freespace.c, but instead |
||||||
|
* of tracking the amount of free space on pages, we only track whether |
||||||
|
* pages are completely free or in-use. We use the same FSM implementation |
||||||
|
* as for heaps, using BLCKSZ - 1 to denote free (unused) pages, and 0 for used pages. |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
#include "postgres.h" |
||||||
|
|
||||||
|
#include "storage/freespace.h" |
||||||
|
#include "storage/indexfsm.h" |
||||||
|
#include "storage/smgr.h" |
||||||
|
|
||||||
|
/*
|
||||||
|
* Exported routines |
||||||
|
*/ |
||||||
|
|
||||||
|
/*
|
||||||
|
* InitIndexFreeSpaceMap - Create or reset the FSM fork for relation. |
||||||
|
*/ |
||||||
|
void |
||||||
|
InitIndexFreeSpaceMap(Relation rel) |
||||||
|
{ |
||||||
|
/* Create FSM fork if it doesn't exist yet, or truncate it if it does */ |
||||||
|
RelationOpenSmgr(rel); |
||||||
|
if (!smgrexists(rel->rd_smgr, FSM_FORKNUM)) |
||||||
|
smgrcreate(rel->rd_smgr, FSM_FORKNUM, rel->rd_istemp, false); |
||||||
|
else |
||||||
|
smgrtruncate(rel->rd_smgr, FSM_FORKNUM, 0, rel->rd_istemp); |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* GetFreeIndexPage - return a free page from the FSM |
||||||
|
* |
||||||
|
* As a side effect, the page is marked as used in the FSM. |
||||||
|
*/ |
||||||
|
BlockNumber |
||||||
|
GetFreeIndexPage(Relation rel) |
||||||
|
{ |
||||||
|
BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ/2); |
||||||
|
|
||||||
|
if (blkno != InvalidBlockNumber) |
||||||
|
RecordUsedIndexPage(rel, blkno); |
||||||
|
|
||||||
|
return blkno; |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* RecordFreeIndexPage - mark a page as free in the FSM |
||||||
|
*/ |
||||||
|
void |
||||||
|
RecordFreeIndexPage(Relation rel, BlockNumber freeBlock) |
||||||
|
{ |
||||||
|
RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1); |
||||||
|
} |
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RecordUsedIndexPage - mark a page as used in the FSM |
||||||
|
*/ |
||||||
|
void |
||||||
|
RecordUsedIndexPage(Relation rel, BlockNumber usedBlock) |
||||||
|
{ |
||||||
|
RecordPageWithFreeSpace(rel, usedBlock, 0); |
||||||
|
} |
||||||
|
|
||||||
|
/*
|
||||||
|
* IndexFreeSpaceMapTruncate - adjust for truncation of a relation. |
||||||
|
* |
||||||
|
* We need to delete any stored data past the new relation length, so that |
||||||
|
* we don't bogusly return removed block numbers. |
||||||
|
*/ |
||||||
|
void |
||||||
|
IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks) |
||||||
|
{ |
||||||
|
FreeSpaceMapTruncateRel(rel, nblocks); |
||||||
|
} |
@ -0,0 +1,73 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* |
||||||
|
* fsm_internals.h |
||||||
|
* internal functions for free space map |
||||||
|
* |
||||||
|
* |
||||||
|
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California |
||||||
|
* |
||||||
|
* $PostgreSQL: pgsql/src/include/storage/fsm_internals.h,v 1.1 2008/09/30 10:52:14 heikki Exp $ |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
#ifndef FSM_INTERNALS_H |
||||||
|
#define FSM_INTERNALS_H |
||||||
|
|
||||||
|
#include "storage/buf.h" |
||||||
|
#include "storage/bufpage.h" |
||||||
|
#include "lib/stringinfo.h" |
||||||
|
|
||||||
|
/*
|
||||||
|
* Structure of a FSM page. See src/backend/storage/freespace/README for |
||||||
|
* details. |
||||||
|
*/ |
||||||
|
typedef struct |
||||||
|
{ |
||||||
|
/*
|
||||||
|
* fsm_search_avail() tries to spread the load of multiple backends |
||||||
|
* by returning different pages to different backends in a round-robin |
||||||
|
* fashion. fp_next_slot points to the next slot to be returned |
||||||
|
* (assuming there's enough space on it for the request). It's defined |
||||||
|
* as an int, because it's updated without an exclusive lock. uint16 |
||||||
|
* would be more appropriate, but int is more likely to be atomically |
||||||
|
* fetchable/storable. |
||||||
|
*/ |
||||||
|
int fp_next_slot; |
||||||
|
|
||||||
|
/*
|
||||||
|
* fp_nodes contains the binary tree, stored in array. The first |
||||||
|
* NonLeafNodesPerPage elements are upper nodes, and the following |
||||||
|
* LeafNodesPerPage elements are leaf nodes. Unused nodes are zero. |
||||||
|
*/ |
||||||
|
uint8 fp_nodes[1]; |
||||||
|
} FSMPageData; |
||||||
|
|
||||||
|
typedef FSMPageData *FSMPage; |
||||||
|
|
||||||
|
/*
 * Node counts for an FSM page: all nodes, the non-leaf nodes, and the leaf
 * nodes.  These definitions are internal to fsmpage.c.
 *
 * NodesPerPage is what remains of a block after the max-aligned page header
 * and the part of FSMPageData that precedes fp_nodes.
 */
#define NodesPerPage \
	(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
	 offsetof(FSMPageData, fp_nodes))

#define NonLeafNodesPerPage	(BLCKSZ / 2 - 1)
#define LeafNodesPerPage	(NodesPerPage - NonLeafNodesPerPage)

/*
 * Number of FSM "slots" on a FSM page.  Code outside fsmpage.c should use
 * this rather than the node counts above.
 */
#define SlotsPerFSMPage		LeafNodesPerPage
||||||
|
|
||||||
|
/* Prototypes for functions in fsmpage.c */ |
||||||
|
extern int fsm_search_avail(Buffer buf, uint8 min_cat, bool advancenext, |
||||||
|
bool exclusive_lock_held); |
||||||
|
extern uint8 fsm_get_avail(Page page, int slot); |
||||||
|
extern uint8 fsm_get_max_avail(Page page); |
||||||
|
extern bool fsm_set_avail(Page page, int slot, uint8 value); |
||||||
|
extern bool fsm_truncate_avail(Page page, int nslots); |
||||||
|
extern bool fsm_rebuild_page(Page page); |
||||||
|
|
||||||
|
#endif /* FSM_INTERNALS_H */ |
@ -0,0 +1,27 @@ |
|||||||
|
/*-------------------------------------------------------------------------
|
||||||
|
* |
||||||
|
* indexfsm.h |
||||||
|
* POSTGRES free space map for quickly finding an unused page in an index |
||||||
|
* |
||||||
|
* |
||||||
|
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
||||||
|
* Portions Copyright (c) 1994, Regents of the University of California |
||||||
|
* |
||||||
|
* $PostgreSQL: pgsql/src/include/storage/indexfsm.h,v 1.1 2008/09/30 10:52:14 heikki Exp $ |
||||||
|
* |
||||||
|
*------------------------------------------------------------------------- |
||||||
|
*/ |
||||||
|
#ifndef INDEXFSM_H_
#define INDEXFSM_H_

#include "utils/rel.h"

/* Create the FSM fork for the relation, or reset it if it already exists */
extern void InitIndexFreeSpaceMap(Relation rel);

/*
 * Per-page bookkeeping.  GetFreeIndexPage returns a free page from the FSM
 * and, as a side effect, marks it as used; the Record functions mark a
 * single page as free or used.  Parameter names match the definitions.
 */
extern BlockNumber GetFreeIndexPage(Relation rel);
extern void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock);
extern void RecordUsedIndexPage(Relation rel, BlockNumber usedBlock);

/* Delete stored FSM data past the new length of a truncated relation */
extern void IndexFreeSpaceMapTruncate(Relation rel, BlockNumber nblocks);

#endif   /* INDEXFSM_H_ */
Loading…
Reference in new issue