@@ -7,90 +7,117 @@
 * Copyright (c) 2001-2006, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
-*	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
+*	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

 #include "postgres.h"

-#include <unistd.h>
 #include <math.h>
+#include <unistd.h>

 #include "executor/instrument.h"

-/* This is the function that is used to determine the sampling intervals. In
- * general, if the function is f(x), then for N tuples we will take on the
- * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
- * number of samples that would be collected over 1,000,000 tuples.
-
-   f(x) = x          =>  log2(N)         20
-   f(x) = x^(1/2)    =>  2 * N^(1/2)     2000
-   f(x) = x^(1/3)    =>  1.5 * N^(2/3)   15000
-
+/*
+ * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
+ * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
+ * (Because that requires a kernel call on most systems, it's expensive.)
+ *
+ * This macro determines the sampling interval: that is, after how many more
+ * iterations will we take the next time sample, given that niters iterations
+ * have occurred already.  In general, if the function is f(x), then for N
+ * iterations we will take on the order of integral(1/f(x), x=0..N)
+ * samples.  Some examples follow, with the number of samples that would be
+ * collected over 1,000,000 iterations:
+ *
+ *	f(x) = x          =>  log2(N)         20
+ *	f(x) = x^(1/2)    =>  2 * N^(1/2)     2000
+ *	f(x) = x^(1/3)    =>  1.5 * N^(2/3)   15000
+ *
  * I've chosen the last one as it seems to provide a good compromise between
  * low overhead but still getting a meaningful number of samples.  However,
  * not all machines have the cbrt() function so on those we substitute
  * sqrt().  The difference is not very significant in the tests I made.
- */
+ *
+ * The actual sampling interval is randomized with the SampleFunc() value
+ * as the mean; this hopefully will reduce any measurement bias due to
+ * cyclic variation in the node execution time.
+ */
 #ifdef HAVE_CBRT
-#define SampleFunc cbrt
+#define SampleFunc(niters) cbrt(niters)
 #else
-#define SampleFunc sqrt
+#define SampleFunc(niters) sqrt(niters)
 #endif
+
+#define SampleInterval(niters) \
+	(SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
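
The integral estimates in the comment above are easy to check empirically
with a standalone sketch (illustrative only, not backend code). It simulates
the threshold-plus-randomized-interval logic over 1,000,000 iterations; the
main() driver, the N_ITERS and THRESHOLD constants, and the use of stdlib
rand()/RAND_MAX in place of the backend's random()/MAX_RANDOM_VALUE are all
assumptions made for the example. Built with -lm, it prints on the order of
15,000 samples, matching integral(1/cbrt(x), x=0..N) = 1.5 * N^(2/3).

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define N_ITERS   1000000           /* iterations to simulate */
    #define THRESHOLD 50                /* mirrors SAMPLE_THRESHOLD */

    int
    main(void)
    {
        double  nextsample = 0.0;
        long    samples = 0;
        long    i;

        for (i = 1; i <= N_ITERS; i++)
        {
            if (i < THRESHOLD)
                samples++;              /* always sample below the threshold */
            else if ((double) i >= nextsample)
            {
                samples++;
                /* randomized interval with mean cbrt(i), as in SampleInterval */
                nextsample = i + cbrt((double) i) *
                    (double) rand() / ((double) RAND_MAX / 2.0);
            }
        }
        printf("%ld samples over %d iterations\n", samples, N_ITERS);
        return 0;
    }
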
 /*
  * We sample at every node iteration until we've reached this threshold,
  * so that nodes not called a large number of times are completely accurate.
+ * (Perhaps this should be a GUC variable?  But beware of messing up
+ * CalculateSampleOverhead if value is too small.)
  */
 #define SAMPLE_THRESHOLD 50

+/*
+ * Determine the sampling overhead.  This only needs to be done once per
+ * backend (actually, probably once per postmaster would do ...)
+ */
 static double SampleOverhead;
-static bool SampleOverheadCalculated;
+static bool SampleOverheadCalculated = false;

 static void
-CalculateSampleOverhead()
+CalculateSampleOverhead(void)
 {
 	Instrumentation instr;
 	int			i;

-	/* We want to determine the sampling overhead, to correct
-	 * calculations later. This only needs to be done once per backend.
-	 * Is this the place? A wrong value here (due to a mistimed
-	 * task-switch) will cause bad calculations later.
-	 *
+	/*
 	 * We could get a wrong result due to being interrupted by task switch.
 	 * To minimize the risk we do it a few times and take the lowest.
 	 */
-
 	SampleOverhead = 1.0e6;
-
-	for( i = 0; i<5; i++ )
+	for (i = 0; i < 5; i++)
 	{
 		int			j;
 		double		overhead;

-		memset( &instr, 0, sizeof(instr) );
-
-		/* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
-		for( j=0; j<SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; i++ )
+		memset(&instr, 0, sizeof(instr));
+		/*
+		 * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
+		 * so that's as far as we can test.
+		 */
+		for (j = 0; j < SAMPLE_THRESHOLD; j++)
 		{
-			InstrStartNode( &instr );
-			InstrStopNode( &instr, 1 );
+			InstrStartNode(&instr);
+			InstrStopNode(&instr, 1);
 		}
 		overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
-		if( overhead < SampleOverhead )
+		if (overhead < SampleOverhead)
 			SampleOverhead = overhead;
 	}

 	SampleOverheadCalculated = true;
 }
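
Two details of the rewritten calibration loop are worth spelling out. First,
the old inner loop incremented i instead of j, so it terminated only via its
100-microsecond cutoff and corrupted the outer loop counter; the new
fixed-count loop removes both the bug and the cutoff. Second, taking the
lowest of five runs is a standard way to discard a run inflated by a
mistimed task switch. A standalone sketch of that min-of-N idea
(illustrative only: the timer_overhead() and elapsed_sec() helpers and the
main() driver are hypothetical, and plain POSIX gettimeofday() stands in for
the backend's INSTR_TIME machinery):

    #include <stdio.h>
    #include <sys/time.h>

    /* Seconds between two gettimeofday() readings. */
    static double
    elapsed_sec(const struct timeval *a, const struct timeval *b)
    {
        return (b->tv_sec - a->tv_sec) + (b->tv_usec - a->tv_usec) * 1.0e-6;
    }

    /* Estimate the per-call cost of the timer itself: time a fixed batch
     * of calls several times and keep the lowest result, so that a run
     * stretched by a task switch is simply discarded. */
    static double
    timer_overhead(void)
    {
        double  best = 1.0e6;
        int     run;

        for (run = 0; run < 5; run++)
        {
            struct timeval start, now;
            int     calls;

            gettimeofday(&start, NULL);
            for (calls = 0; calls < 1000; calls++)
                gettimeofday(&now, NULL);
            if (elapsed_sec(&start, &now) / calls < best)
                best = elapsed_sec(&start, &now) / calls;
        }
        return best;
    }

    int
    main(void)
    {
        printf("approx. timer overhead: %g sec/call\n", timer_overhead());
        return 0;
    }
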
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
 InstrAlloc(int n)
 {
-	Instrumentation *instr = palloc0(n * sizeof(Instrumentation));
+	Instrumentation *instr;

-	/* we don't need to do any initialization except zero 'em */
-
-	/* Calculate overhead, if not done yet */
-	if( !SampleOverheadCalculated )
+	/* Calculate sampling overhead, if not done yet in this backend */
+	if (!SampleOverheadCalculated)
 		CalculateSampleOverhead();

+	instr = palloc0(n * sizeof(Instrumentation));
+
+	/* we don't need to do any initialization except zero 'em */
+
 	return instr;
 }
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
 {
 	if (INSTR_TIME_IS_ZERO(instr->starttime))
 	{
-		/* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
-		if (instr->tuplecount < SAMPLE_THRESHOLD)
+		/*
+		 * Always sample if not yet up to threshold, else check whether
+		 * next threshold has been reached
+		 */
+		if (instr->itercount < SAMPLE_THRESHOLD)
 			instr->sampling = true;
-		else
+		else if (instr->itercount >= instr->nextsample)
 		{
-			/* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
-			if( instr->tuplecount > instr->nextsample )
-			{
-				instr->sampling = true;
-				/* The doubling is so the random will average 1 over time */
-				instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX;
-			}
+			instr->sampling = true;
+			instr->nextsample =
+				instr->itercount + SampleInterval(instr->itercount);
 		}
 	}
 	if (instr->sampling)
 		INSTR_TIME_SET_CURRENT(instr->starttime);
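
Note why SampleInterval() divides by MAX_RANDOM_VALUE/2 rather than
MAX_RANDOM_VALUE: the random factor is then uniform on [0, 2] with mean 1.0,
so the interval averages exactly the SampleFunc() value. The deleted code
reached the same result with an explicit 2.0 multiplier ("the doubling is so
the random will average 1 over time"). A quick standalone check
(illustrative only; stdlib rand()/RAND_MAX stands in for the backend's
random()/MAX_RANDOM_VALUE):

    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        double  sum = 0.0;
        int     i;

        for (i = 0; i < 1000000; i++)
            sum += (double) rand() / ((double) RAND_MAX / 2.0);
        /* expect a mean close to 1.0 */
        printf("mean randomization factor = %.4f\n", sum / 1000000.0);
        return 0;
    }
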
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
 void
 InstrStopNode(Instrumentation *instr, double nTuples)
 {
-	instr_time	endtime;
-
-	/* count the returned tuples */
+	/* count the returned tuples and iterations */
 	instr->tuplecount += nTuples;
+	instr->itercount += 1;

 	/* measure runtime if appropriate */
 	if (instr->sampling)
 	{
+		instr_time	endtime;
+
 		if (INSTR_TIME_IS_ZERO(instr->starttime))
 		{
 			elog(DEBUG2, "InstrStopNode called without start");
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 #endif

 		INSTR_TIME_SET_ZERO(instr->starttime);
-		instr->samplecount += nTuples;
+
+		instr->samplecount += 1;
 		instr->sampling = false;
 	}
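
The renaming here is the crux of the change: tuplecount accumulates nTuples
(rows returned, a double and possibly fractional), itercount counts node
invocations, and samplecount now counts only the invocations that were
actually timed. A toy illustration (not backend code; the numbers are made
up) of how the three counters diverge:

    #include <stdio.h>

    int
    main(void)
    {
        double  tuplecount = 0;     /* rows returned, advanced by nTuples */
        long    itercount = 0;      /* node invocations */
        long    samplecount = 0;    /* timed invocations only */
        int     call;

        for (call = 0; call < 3; call++)
        {
            tuplecount += 100;      /* as in InstrStopNode(instr, 100) */
            itercount += 1;
            if (call < 2)           /* pretend sampling timed two calls */
                samplecount += 1;
        }
        printf("tuples=%.0f iters=%ld samples=%ld\n",
               tuplecount, itercount, samplecount);
        return 0;
    }
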
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
 	if (!INSTR_TIME_IS_ZERO(instr->starttime))
 		elog(DEBUG2, "InstrEndLoop called on running node");

-	/* Accumulate per-cycle statistics into totals */
+	/* Compute time spent in node */
 	totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);

-	instr->startup += instr->firsttuple;
-
-	/* Here we take into account sampling effects. Doing it naively ends
-	 * up assuming the sampling overhead applies to all tuples, even the
-	 * ones we didn't measure. We've calculated an overhead, so we
-	 * subtract that for all samples we didn't measure. The first tuple
-	 * is also special cased, because it usually takes longer. */
-	if( instr->samplecount < instr->tuplecount )
+	/*
+	 * If we didn't measure runtime on every iteration, then we must increase
+	 * the measured total to account for the other iterations.  Naively
+	 * multiplying totaltime by itercount/samplecount would be wrong because
+	 * it effectively assumes the sampling overhead applies to all iterations,
+	 * even the ones we didn't measure.  (Note that what we are trying to
+	 * estimate here is the actual time spent in the node, including the
+	 * actual measurement overhead; not the time exclusive of measurement
+	 * overhead.)  We exclude the first iteration from the correction basis,
+	 * because it often takes longer than others.
+	 */
+	if (instr->itercount > instr->samplecount)
 	{
-		double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
-		instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
-			+ ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+		double		per_iter;
+
+		per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
+			- SampleOverhead;
+		if (per_iter > 0)			/* sanity check */
+			totaltime += per_iter * (instr->itercount - instr->samplecount);
 	}
-	else
-		instr->total += totaltime;

+	/* Accumulate per-cycle statistics into totals */
+	instr->startup += instr->firsttuple;
+	instr->total += totaltime;
 	instr->ntuples += instr->tuplecount;
 	instr->nsamples += instr->samplecount;
 	instr->nloops += 1;

 	/* Reset for next cycle (if any) */
 	instr->running = false;
 	instr->sampling = false;
 	INSTR_TIME_SET_ZERO(instr->starttime);
 	INSTR_TIME_SET_ZERO(instr->counter);
 	instr->firsttuple = 0;
-	instr->samplecount = 0;
 	instr->tuplecount = 0;
+	instr->itercount = 0;
+	instr->samplecount = 0;
+	instr->nextsample = 0;
 }
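
To make the correction concrete, here is a worked example as a standalone
sketch. Every number is made up for illustration; the variable names mirror
the patch, but nothing below is backend code. With 10,000 iterations of
which 150 were timed, a measured total of 0.030 s, a first iteration of
0.001 s, and a calibrated overhead of 20 microseconds per sample, the
per-iteration estimate is (0.030 - 0.001) / 149 - 20e-6, about 175
microseconds, and the corrected total comes out near 1.75 s:

    #include <stdio.h>

    int
    main(void)
    {
        double  totaltime = 0.030;          /* measured over sampled iterations */
        double  firsttuple = 0.001;         /* first iteration, excluded from basis */
        double  sample_overhead = 20e-6;    /* from calibration */
        long    itercount = 10000;
        long    samplecount = 150;
        double  per_iter;

        per_iter = (totaltime - firsttuple) / (samplecount - 1)
            - sample_overhead;
        if (per_iter > 0)                   /* sanity check, as in the patch */
            totaltime += per_iter * (itercount - samplecount);
        printf("estimated node total: %.3f s\n", totaltime);    /* ~1.750 */
        return 0;
    }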