mirror of https://github.com/postgres/postgres
This commit changes index-only scans so that data is read directly from the index tuple without first generating a faux heap tuple. The only immediate benefit is that indexes on system columns (such as OID) can be used in index-only scans, but this is necessary infrastructure if we are ever to support index-only scans on expression indexes. The executor is now ready for that, though the planner still needs substantial work to recognize the possibility. To do this, Vars in index-only plan nodes have to refer to index columns not heap columns. I introduced a new special varno, INDEX_VAR, to mark such Vars to avoid confusion. (In passing, this commit renames the two existing special varnos to OUTER_VAR and INNER_VAR.) This allows ruleutils.c to handle them with logic similar to what we use for subplan reference Vars. Since index-only scans are now fundamentally different from regular indexscans so far as their expression subtrees are concerned, I also chose to change them to have their own plan node type (and hence, their own executor source file).pull/1/head
parent
fa351d5a0d
commit
a0185461dd
@ -0,0 +1,542 @@ |
||||
/*-------------------------------------------------------------------------
|
||||
* |
||||
* nodeIndexonlyscan.c |
||||
* Routines to support index-only scans |
||||
* |
||||
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group |
||||
* Portions Copyright (c) 1994, Regents of the University of California |
||||
* |
||||
* |
||||
* IDENTIFICATION |
||||
* src/backend/executor/nodeIndexonlyscan.c |
||||
* |
||||
*------------------------------------------------------------------------- |
||||
*/ |
||||
/*
|
||||
* INTERFACE ROUTINES |
||||
* ExecIndexOnlyScan scans an index |
||||
* IndexOnlyNext retrieve next tuple |
||||
* ExecInitIndexOnlyScan creates and initializes state info. |
||||
* ExecReScanIndexOnlyScan rescans the indexed relation. |
||||
* ExecEndIndexOnlyScan releases all storage. |
||||
* ExecIndexOnlyMarkPos marks scan position. |
||||
* ExecIndexOnlyRestrPos restores scan position. |
||||
*/ |
||||
#include "postgres.h" |
||||
|
||||
#include "access/relscan.h" |
||||
#include "access/visibilitymap.h" |
||||
#include "catalog/pg_opfamily.h" |
||||
#include "catalog/pg_type.h" |
||||
#include "executor/execdebug.h" |
||||
#include "executor/nodeIndexonlyscan.h" |
||||
#include "executor/nodeIndexscan.h" |
||||
#include "storage/bufmgr.h" |
||||
#include "utils/memutils.h" |
||||
#include "utils/rel.h" |
||||
|
||||
|
||||
static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); |
||||
static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, |
||||
Relation indexRel); |
||||
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* IndexOnlyNext |
||||
* |
||||
* Retrieve a tuple from the IndexOnlyScan node's index. |
||||
* ---------------------------------------------------------------- |
||||
*/ |
||||
static TupleTableSlot * |
||||
IndexOnlyNext(IndexOnlyScanState *node) |
||||
{ |
||||
EState *estate; |
||||
ExprContext *econtext; |
||||
ScanDirection direction; |
||||
IndexScanDesc scandesc; |
||||
HeapTuple tuple; |
||||
TupleTableSlot *slot; |
||||
ItemPointer tid; |
||||
|
||||
/*
|
||||
* extract necessary information from index scan node |
||||
*/ |
||||
estate = node->ss.ps.state; |
||||
direction = estate->es_direction; |
||||
/* flip direction if this is an overall backward scan */ |
||||
if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) |
||||
{ |
||||
if (ScanDirectionIsForward(direction)) |
||||
direction = BackwardScanDirection; |
||||
else if (ScanDirectionIsBackward(direction)) |
||||
direction = ForwardScanDirection; |
||||
} |
||||
scandesc = node->ioss_ScanDesc; |
||||
econtext = node->ss.ps.ps_ExprContext; |
||||
slot = node->ss.ss_ScanTupleSlot; |
||||
|
||||
/*
|
||||
* OK, now that we have what we need, fetch the next tuple. |
||||
*/ |
||||
while ((tid = index_getnext_tid(scandesc, direction)) != NULL) |
||||
{ |
||||
/*
|
||||
* We can skip the heap fetch if the TID references a heap page on |
||||
* which all tuples are known visible to everybody. In any case, |
||||
* we'll use the index tuple not the heap tuple as the data source. |
||||
*/ |
||||
if (!visibilitymap_test(scandesc->heapRelation, |
||||
ItemPointerGetBlockNumber(tid), |
||||
&node->ioss_VMBuffer)) |
||||
{ |
||||
/*
|
||||
* Rats, we have to visit the heap to check visibility. |
||||
*/ |
||||
tuple = index_fetch_heap(scandesc); |
||||
if (tuple == NULL) |
||||
continue; /* no visible tuple, try next index entry */ |
||||
|
||||
/*
|
||||
* Only MVCC snapshots are supported here, so there should be no |
||||
* need to keep following the HOT chain once a visible entry has |
||||
* been found. If we did want to allow that, we'd need to keep |
||||
* more state to remember not to call index_getnext_tid next time. |
||||
*/ |
||||
if (scandesc->xs_continue_hot) |
||||
elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); |
||||
|
||||
/*
|
||||
* Note: at this point we are holding a pin on the heap page, as |
||||
* recorded in scandesc->xs_cbuf. We could release that pin now, |
||||
* but it's not clear whether it's a win to do so. The next index |
||||
* entry might require a visit to the same heap page. |
||||
*/ |
||||
} |
||||
|
||||
/*
|
||||
* Fill the scan tuple slot with data from the index. |
||||
*/ |
||||
StoreIndexTuple(slot, scandesc->xs_itup, scandesc->indexRelation); |
||||
|
||||
/*
|
||||
* If the index was lossy, we have to recheck the index quals. |
||||
* (Currently, this can never happen, but we should support the case |
||||
* for possible future use, eg with GiST indexes.) |
||||
*/ |
||||
if (scandesc->xs_recheck) |
||||
{ |
||||
econtext->ecxt_scantuple = slot; |
||||
ResetExprContext(econtext); |
||||
if (!ExecQual(node->indexqual, econtext, false)) |
||||
{ |
||||
/* Fails recheck, so drop it and loop back for another */ |
||||
InstrCountFiltered2(node, 1); |
||||
continue; |
||||
} |
||||
} |
||||
|
||||
return slot; |
||||
} |
||||
|
||||
/*
|
||||
* if we get here it means the index scan failed so we are at the end of |
||||
* the scan.. |
||||
*/ |
||||
return ExecClearTuple(slot); |
||||
} |
||||
|
||||
/*
|
||||
* StoreIndexTuple |
||||
* Fill the slot with data from the index tuple. |
||||
* |
||||
* At some point this might be generally-useful functionality, but |
||||
* right now we don't need it elsewhere. |
||||
*/ |
||||
static void |
||||
StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, Relation indexRel) |
||||
{ |
||||
TupleDesc indexDesc = RelationGetDescr(indexRel); |
||||
int nindexatts = indexDesc->natts; |
||||
Datum *values = slot->tts_values; |
||||
bool *isnull = slot->tts_isnull; |
||||
int i; |
||||
|
||||
/*
|
||||
* Note: we must use the index relation's tupdesc in index_getattr, |
||||
* not the slot's tupdesc, because of index_descriptor_hack(). |
||||
*/ |
||||
Assert(slot->tts_tupleDescriptor->natts == nindexatts); |
||||
|
||||
ExecClearTuple(slot); |
||||
for (i = 0; i < nindexatts; i++) |
||||
values[i] = index_getattr(itup, i + 1, indexDesc, &isnull[i]); |
||||
ExecStoreVirtualTuple(slot); |
||||
} |
||||
|
||||
/*
|
||||
* index_descriptor_hack -- ugly kluge to make index's tupdesc OK for slot |
||||
* |
||||
* This is necessary because, alone among btree opclasses, name_ops uses |
||||
* a storage type (cstring) different from its input type. The index |
||||
* tuple descriptor will show "cstring", which is correct, but we have to |
||||
* expose "name" as the slot datatype or ExecEvalVar will whine. If we |
||||
* ever want to have any other cases with a different storage type, we ought |
||||
* to think of a cleaner solution than this. |
||||
*/ |
||||
static TupleDesc |
||||
index_descriptor_hack(Relation indexRel) |
||||
{ |
||||
TupleDesc tupdesc = RelationGetDescr(indexRel); |
||||
int i; |
||||
|
||||
/* copy so we can scribble on it safely */ |
||||
tupdesc = CreateTupleDescCopy(tupdesc); |
||||
|
||||
for (i = 0; i < tupdesc->natts; i++) |
||||
{ |
||||
if (indexRel->rd_opfamily[i] == NAME_BTREE_FAM_OID && |
||||
tupdesc->attrs[i]->atttypid == CSTRINGOID) |
||||
{ |
||||
tupdesc->attrs[i]->atttypid = NAMEOID; |
||||
|
||||
/*
|
||||
* We set attlen to match the type OID just in case anything looks |
||||
* at it. Note that this is safe only because StoreIndexTuple |
||||
* will insert the data as a virtual tuple, and we don't expect |
||||
* anything will try to materialize the scan tuple slot. |
||||
*/ |
||||
tupdesc->attrs[i]->attlen = NAMEDATALEN; |
||||
} |
||||
} |
||||
|
||||
return tupdesc; |
||||
} |
||||
|
||||
/*
|
||||
* IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual |
||||
* |
||||
* This can't really happen, since an index can't supply CTID which would |
||||
* be necessary data for any potential EvalPlanQual target relation. If it |
||||
* did happen, the EPQ code would pass us the wrong data, namely a heap |
||||
* tuple not an index tuple. So throw an error. |
||||
*/ |
||||
static bool |
||||
IndexOnlyRecheck(IndexOnlyScanState *node, TupleTableSlot *slot) |
||||
{ |
||||
elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans"); |
||||
return false; /* keep compiler quiet */ |
||||
} |
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecIndexOnlyScan(node) |
||||
* ---------------------------------------------------------------- |
||||
*/ |
||||
TupleTableSlot * |
||||
ExecIndexOnlyScan(IndexOnlyScanState *node) |
||||
{ |
||||
/*
|
||||
* If we have runtime keys and they've not already been set up, do it now. |
||||
*/ |
||||
if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady) |
||||
ExecReScan((PlanState *) node); |
||||
|
||||
return ExecScan(&node->ss, |
||||
(ExecScanAccessMtd) IndexOnlyNext, |
||||
(ExecScanRecheckMtd) IndexOnlyRecheck); |
||||
} |
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecReScanIndexOnlyScan(node) |
||||
* |
||||
* Recalculates the values of any scan keys whose value depends on |
||||
* information known at runtime, then rescans the indexed relation. |
||||
* |
||||
* Updating the scan key was formerly done separately in |
||||
* ExecUpdateIndexScanKeys. Integrating it into ReScan makes |
||||
* rescans of indices and relations/general streams more uniform. |
||||
* ---------------------------------------------------------------- |
||||
*/ |
||||
void |
||||
ExecReScanIndexOnlyScan(IndexOnlyScanState *node) |
||||
{ |
||||
/*
|
||||
* If we are doing runtime key calculations (ie, any of the index key |
||||
* values weren't simple Consts), compute the new key values. But first, |
||||
* reset the context so we don't leak memory as each outer tuple is |
||||
* scanned. Note this assumes that we will recalculate *all* runtime keys |
||||
* on each call. |
||||
*/ |
||||
if (node->ioss_NumRuntimeKeys != 0) |
||||
{ |
||||
ExprContext *econtext = node->ioss_RuntimeContext; |
||||
|
||||
ResetExprContext(econtext); |
||||
ExecIndexEvalRuntimeKeys(econtext, |
||||
node->ioss_RuntimeKeys, |
||||
node->ioss_NumRuntimeKeys); |
||||
} |
||||
node->ioss_RuntimeKeysReady = true; |
||||
|
||||
/* reset index scan */ |
||||
index_rescan(node->ioss_ScanDesc, |
||||
node->ioss_ScanKeys, node->ioss_NumScanKeys, |
||||
node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); |
||||
|
||||
ExecScanReScan(&node->ss); |
||||
} |
||||
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecEndIndexOnlyScan |
||||
* ---------------------------------------------------------------- |
||||
*/ |
||||
void |
||||
ExecEndIndexOnlyScan(IndexOnlyScanState *node) |
||||
{ |
||||
Relation indexRelationDesc; |
||||
IndexScanDesc indexScanDesc; |
||||
Relation relation; |
||||
|
||||
/*
|
||||
* extract information from the node |
||||
*/ |
||||
indexRelationDesc = node->ioss_RelationDesc; |
||||
indexScanDesc = node->ioss_ScanDesc; |
||||
relation = node->ss.ss_currentRelation; |
||||
|
||||
/* Release VM buffer pin, if any. */ |
||||
if (node->ioss_VMBuffer != InvalidBuffer) |
||||
{ |
||||
ReleaseBuffer(node->ioss_VMBuffer); |
||||
node->ioss_VMBuffer = InvalidBuffer; |
||||
} |
||||
|
||||
/*
|
||||
* Free the exprcontext(s) ... now dead code, see ExecFreeExprContext |
||||
*/ |
||||
#ifdef NOT_USED |
||||
ExecFreeExprContext(&node->ss.ps); |
||||
if (node->ioss_RuntimeContext) |
||||
FreeExprContext(node->ioss_RuntimeContext, true); |
||||
#endif |
||||
|
||||
/*
|
||||
* clear out tuple table slots |
||||
*/ |
||||
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); |
||||
ExecClearTuple(node->ss.ss_ScanTupleSlot); |
||||
|
||||
/*
|
||||
* close the index relation (no-op if we didn't open it) |
||||
*/ |
||||
if (indexScanDesc) |
||||
index_endscan(indexScanDesc); |
||||
if (indexRelationDesc) |
||||
index_close(indexRelationDesc, NoLock); |
||||
|
||||
/*
|
||||
* close the heap relation. |
||||
*/ |
||||
ExecCloseScanRelation(relation); |
||||
} |
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecIndexOnlyMarkPos |
||||
* ---------------------------------------------------------------- |
||||
*/ |
||||
void |
||||
ExecIndexOnlyMarkPos(IndexOnlyScanState *node) |
||||
{ |
||||
index_markpos(node->ioss_ScanDesc); |
||||
} |
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecIndexOnlyRestrPos |
||||
* ---------------------------------------------------------------- |
||||
*/ |
||||
void |
||||
ExecIndexOnlyRestrPos(IndexOnlyScanState *node) |
||||
{ |
||||
index_restrpos(node->ioss_ScanDesc); |
||||
} |
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* ExecInitIndexOnlyScan |
||||
* |
||||
* Initializes the index scan's state information, creates |
||||
* scan keys, and opens the base and index relations. |
||||
* |
||||
* Note: index scans have 2 sets of state information because |
||||
* we have to keep track of the base relation and the |
||||
* index relation. |
||||
* ---------------------------------------------------------------- |
||||
*/ |
||||
IndexOnlyScanState * |
||||
ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) |
||||
{ |
||||
IndexOnlyScanState *indexstate; |
||||
Relation currentRelation; |
||||
bool relistarget; |
||||
TupleDesc tupDesc; |
||||
|
||||
/*
|
||||
* create state structure |
||||
*/ |
||||
indexstate = makeNode(IndexOnlyScanState); |
||||
indexstate->ss.ps.plan = (Plan *) node; |
||||
indexstate->ss.ps.state = estate; |
||||
|
||||
/*
|
||||
* Miscellaneous initialization |
||||
* |
||||
* create expression context for node |
||||
*/ |
||||
ExecAssignExprContext(estate, &indexstate->ss.ps); |
||||
|
||||
indexstate->ss.ps.ps_TupFromTlist = false; |
||||
|
||||
/*
|
||||
* initialize child expressions |
||||
* |
||||
* Note: we don't initialize all of the indexorderby expression, only the |
||||
* sub-parts corresponding to runtime keys (see below). |
||||
*/ |
||||
indexstate->ss.ps.targetlist = (List *) |
||||
ExecInitExpr((Expr *) node->scan.plan.targetlist, |
||||
(PlanState *) indexstate); |
||||
indexstate->ss.ps.qual = (List *) |
||||
ExecInitExpr((Expr *) node->scan.plan.qual, |
||||
(PlanState *) indexstate); |
||||
indexstate->indexqual = (List *) |
||||
ExecInitExpr((Expr *) node->indexqual, |
||||
(PlanState *) indexstate); |
||||
|
||||
/*
|
||||
* tuple table initialization |
||||
*/ |
||||
ExecInitResultTupleSlot(estate, &indexstate->ss.ps); |
||||
ExecInitScanTupleSlot(estate, &indexstate->ss); |
||||
|
||||
/*
|
||||
* open the base relation and acquire appropriate lock on it. |
||||
*/ |
||||
currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid); |
||||
|
||||
indexstate->ss.ss_currentRelation = currentRelation; |
||||
indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ |
||||
|
||||
/*
|
||||
* Initialize result tuple type. |
||||
*/ |
||||
ExecAssignResultTypeFromTL(&indexstate->ss.ps); |
||||
|
||||
/*
|
||||
* If we are just doing EXPLAIN (ie, aren't going to run the plan), stop |
||||
* here. This allows an index-advisor plugin to EXPLAIN a plan containing |
||||
* references to nonexistent indexes. |
||||
*/ |
||||
if (eflags & EXEC_FLAG_EXPLAIN_ONLY) |
||||
return indexstate; |
||||
|
||||
/*
|
||||
* Open the index relation. |
||||
* |
||||
* If the parent table is one of the target relations of the query, then |
||||
* InitPlan already opened and write-locked the index, so we can avoid |
||||
* taking another lock here. Otherwise we need a normal reader's lock. |
||||
*/ |
||||
relistarget = ExecRelationIsTargetRelation(estate, node->scan.scanrelid); |
||||
indexstate->ioss_RelationDesc = index_open(node->indexid, |
||||
relistarget ? NoLock : AccessShareLock); |
||||
|
||||
/*
|
||||
* Now we can get the scan tuple's type (which is the index's rowtype, |
||||
* not the heap's) and initialize result projection info. |
||||
*/ |
||||
tupDesc = index_descriptor_hack(indexstate->ioss_RelationDesc); |
||||
ExecAssignScanType(&indexstate->ss, tupDesc); |
||||
ExecAssignScanProjectionInfo(&indexstate->ss); |
||||
|
||||
/*
|
||||
* Initialize index-specific scan state |
||||
*/ |
||||
indexstate->ioss_RuntimeKeysReady = false; |
||||
indexstate->ioss_RuntimeKeys = NULL; |
||||
indexstate->ioss_NumRuntimeKeys = 0; |
||||
|
||||
/*
|
||||
* build the index scan keys from the index qualification |
||||
*/ |
||||
ExecIndexBuildScanKeys((PlanState *) indexstate, |
||||
indexstate->ioss_RelationDesc, |
||||
node->indexqual, |
||||
false, |
||||
&indexstate->ioss_ScanKeys, |
||||
&indexstate->ioss_NumScanKeys, |
||||
&indexstate->ioss_RuntimeKeys, |
||||
&indexstate->ioss_NumRuntimeKeys, |
||||
NULL, /* no ArrayKeys */ |
||||
NULL); |
||||
|
||||
/*
|
||||
* any ORDER BY exprs have to be turned into scankeys in the same way |
||||
*/ |
||||
ExecIndexBuildScanKeys((PlanState *) indexstate, |
||||
indexstate->ioss_RelationDesc, |
||||
node->indexorderby, |
||||
true, |
||||
&indexstate->ioss_OrderByKeys, |
||||
&indexstate->ioss_NumOrderByKeys, |
||||
&indexstate->ioss_RuntimeKeys, |
||||
&indexstate->ioss_NumRuntimeKeys, |
||||
NULL, /* no ArrayKeys */ |
||||
NULL); |
||||
|
||||
/*
|
||||
* If we have runtime keys, we need an ExprContext to evaluate them. The |
||||
* node's standard context won't do because we want to reset that context |
||||
* for every tuple. So, build another context just like the other one... |
||||
* -tgl 7/11/00 |
||||
*/ |
||||
if (indexstate->ioss_NumRuntimeKeys != 0) |
||||
{ |
||||
ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext; |
||||
|
||||
ExecAssignExprContext(estate, &indexstate->ss.ps); |
||||
indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext; |
||||
indexstate->ss.ps.ps_ExprContext = stdecontext; |
||||
} |
||||
else |
||||
{ |
||||
indexstate->ioss_RuntimeContext = NULL; |
||||
} |
||||
|
||||
/*
|
||||
* Initialize scan descriptor. |
||||
*/ |
||||
indexstate->ioss_ScanDesc = index_beginscan(currentRelation, |
||||
indexstate->ioss_RelationDesc, |
||||
estate->es_snapshot, |
||||
indexstate->ioss_NumScanKeys, |
||||
indexstate->ioss_NumOrderByKeys); |
||||
|
||||
/* Set it up for index-only scan */ |
||||
indexstate->ioss_ScanDesc->xs_want_itup = true; |
||||
indexstate->ioss_VMBuffer = InvalidBuffer; |
||||
|
||||
/*
|
||||
* If no run-time keys to calculate, go ahead and pass the scankeys to the |
||||
* index AM. |
||||
*/ |
||||
if (indexstate->ioss_NumRuntimeKeys == 0) |
||||
index_rescan(indexstate->ioss_ScanDesc, |
||||
indexstate->ioss_ScanKeys, |
||||
indexstate->ioss_NumScanKeys, |
||||
indexstate->ioss_OrderByKeys, |
||||
indexstate->ioss_NumOrderByKeys); |
||||
|
||||
/*
|
||||
* all done. |
||||
*/ |
||||
return indexstate; |
||||
} |
@ -0,0 +1,26 @@ |
||||
/*-------------------------------------------------------------------------
 *
 * nodeIndexonlyscan.h
 *	  Prototypes for the index-only scan executor node routines
 *	  (see nodeIndexonlyscan.c).
 *
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/executor/nodeIndexonlyscan.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NODEINDEXONLYSCAN_H
#define NODEINDEXONLYSCAN_H

#include "nodes/execnodes.h"

/* node lifecycle: initialize, fetch next tuple, rescan, shut down */
extern IndexOnlyScanState *ExecInitIndexOnlyScan(IndexOnlyScan *node,
					  EState *estate,
					  int eflags);
extern TupleTableSlot *ExecIndexOnlyScan(IndexOnlyScanState *node);
extern void ExecReScanIndexOnlyScan(IndexOnlyScanState *node);
extern void ExecEndIndexOnlyScan(IndexOnlyScanState *node);

/* mark / restore of the scan position */
extern void ExecIndexOnlyMarkPos(IndexOnlyScanState *node);
extern void ExecIndexOnlyRestrPos(IndexOnlyScanState *node);

#endif   /* NODEINDEXONLYSCAN_H */
Loading…
Reference in new issue