@@ -7,90 +7,117 @@
 * Copyright (c) 2001-2006, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
-*	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
+*	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

 #include "postgres.h"

-#include <unistd.h>
 #include <math.h>
+#include <unistd.h>

 #include "executor/instrument.h"

-/* This is the function that is used to determine the sampling intervals. In
- * general, if the function is f(x), then for N tuples we will take on the
- * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
- * number of samples that would be collected over 1,000,000 tuples.
-
-   f(x) = x          =>  log2(N)         20
-   f(x) = x^(1/2)    =>  2 * N^(1/2)     2000
-   f(x) = x^(1/3)    =>  1.5 * N^(2/3)   15000
-
+/*
+ * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
+ * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
+ * (Because that requires a kernel call on most systems, it's expensive.)
+ *
+ * This macro determines the sampling interval: that is, after how many more
+ * iterations will we take the next time sample, given that niters iterations
+ * have occurred already.  In general, if the function is f(x), then for N
+ * iterations we will take on the order of integral(1/f(x), x=0..N)
+ * samples.  Some examples follow, with the number of samples that would be
+ * collected over 1,000,000 iterations:
+ *
+ *	f(x) = x          =>  log2(N)         20
+ *	f(x) = x^(1/2)    =>  2 * N^(1/2)     2000
+ *	f(x) = x^(1/3)    =>  1.5 * N^(2/3)   15000
+ *
  * I've chosen the last one as it seems to provide a good compromise between
  * low overhead but still getting a meaningful number of samples.  However,
  * not all machines have the cbrt() function so on those we substitute
  * sqrt().  The difference is not very significant in the tests I made.
- */
+ *
+ * The actual sampling interval is randomized with the SampleFunc() value
+ * as the mean; this hopefully will reduce any measurement bias due to
+ * cyclic variation in the node execution time.
+ */
 #ifdef HAVE_CBRT
-#define SampleFunc cbrt
+#define SampleFunc(niters) cbrt(niters)
 #else
-#define SampleFunc sqrt
+#define SampleFunc(niters) sqrt(niters)
 #endif
+
+#define SampleInterval(niters) \
+	(SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
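
The integral estimates in the comment above are easy to check empirically
with a standalone sketch (illustrative only, not backend code). It simulates
the threshold-plus-randomized-interval logic over 1,000,000 iterations; the
main() driver, the N_ITERS and THRESHOLD constants, and the use of stdlib
rand()/RAND_MAX in place of the backend's random()/MAX_RANDOM_VALUE are all
assumptions made for the example. Built with -lm, it prints on the order of
15,000 samples, matching integral(1/cbrt(x), x=0..N) = 1.5 * N^(2/3).

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define N_ITERS   1000000           /* iterations to simulate */
    #define THRESHOLD 50                /* mirrors SAMPLE_THRESHOLD */

    int
    main(void)
    {
        double  nextsample = 0.0;
        long    samples = 0;
        long    i;

        for (i = 1; i <= N_ITERS; i++)
        {
            if (i < THRESHOLD)
                samples++;              /* always sample below the threshold */
            else if ((double) i >= nextsample)
            {
                samples++;
                /* randomized interval with mean cbrt(i), as in SampleInterval */
                nextsample = i + cbrt((double) i) *
                    (double) rand() / ((double) RAND_MAX / 2.0);
            }
        }
        printf("%ld samples over %d iterations\n", samples, N_ITERS);
        return 0;
    }
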
 /*
  * We sample at every node iteration until we've reached this threshold,
  * so that nodes not called a large number of times are completely accurate.
+ * (Perhaps this should be a GUC variable?  But beware of messing up
+ * CalculateSampleOverhead if value is too small.)
  */
 #define SAMPLE_THRESHOLD 50

+/*
+ * Determine the sampling overhead.  This only needs to be done once per
+ * backend (actually, probably once per postmaster would do ...)
+ */
 static double SampleOverhead;
-static bool SampleOverheadCalculated;
+static bool SampleOverheadCalculated = false;

 static void
-CalculateSampleOverhead()
+CalculateSampleOverhead(void)
 {
 	Instrumentation instr;
 	int			i;

-	/* We want to determine the sampling overhead, to correct
-	 * calculations later. This only needs to be done once per backend.
-	 * Is this the place? A wrong value here (due to a mistimed
-	 * task-switch) will cause bad calculations later.
-	 *
+	/*
 	 * We could get a wrong result due to being interrupted by task switch.
 	 * To minimize the risk we do it a few times and take the lowest.
 	 */
-
 	SampleOverhead = 1.0e6;
-
-	for( i = 0; i<5; i++ )
+	for (i = 0; i < 5; i++)
 	{
 		int			j;
 		double		overhead;

-		memset( &instr, 0, sizeof(instr) );
-
-		/* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
-		for( j=0; j<SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; i++ )
+		memset(&instr, 0, sizeof(instr));
+		/*
+		 * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
+		 * so that's as far as we can test.
+		 */
+		for (j = 0; j < SAMPLE_THRESHOLD; j++)
 		{
-			InstrStartNode( &instr );
-			InstrStopNode( &instr, 1 );
+			InstrStartNode(&instr);
+			InstrStopNode(&instr, 1);
 		}
 		overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
-		if( overhead < SampleOverhead )
+		if (overhead < SampleOverhead)
 			SampleOverhead = overhead;
 	}

 	SampleOverheadCalculated = true;
 }
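
Two details of the rewritten calibration loop are worth spelling out. First,
the old inner loop incremented i instead of j, so it terminated only via its
100-microsecond cutoff and corrupted the outer loop counter; the new
fixed-count loop removes both the bug and the cutoff. Second, taking the
lowest of five runs is a standard way to discard a run inflated by a
mistimed task switch. A standalone sketch of that min-of-N idea
(illustrative only: the timer_overhead() and elapsed_sec() helpers and the
main() driver are hypothetical, and plain POSIX gettimeofday() stands in for
the backend's INSTR_TIME machinery):

    #include <stdio.h>
    #include <sys/time.h>

    /* Seconds between two gettimeofday() readings. */
    static double
    elapsed_sec(const struct timeval *a, const struct timeval *b)
    {
        return (b->tv_sec - a->tv_sec) + (b->tv_usec - a->tv_usec) * 1.0e-6;
    }

    /* Estimate the per-call cost of the timer itself: time a fixed batch
     * of calls several times and keep the lowest result, so that a run
     * stretched by a task switch is simply discarded. */
    static double
    timer_overhead(void)
    {
        double  best = 1.0e6;
        int     run;

        for (run = 0; run < 5; run++)
        {
            struct timeval start, now;
            int     calls;

            gettimeofday(&start, NULL);
            for (calls = 0; calls < 1000; calls++)
                gettimeofday(&now, NULL);
            if (elapsed_sec(&start, &now) / calls < best)
                best = elapsed_sec(&start, &now) / calls;
        }
        return best;
    }

    int
    main(void)
    {
        printf("approx. timer overhead: %g sec/call\n", timer_overhead());
        return 0;
    }
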
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
 InstrAlloc(int n)
 {
-	Instrumentation *instr = palloc0(n * sizeof(Instrumentation));
+	Instrumentation *instr;

-	/* we don't need to do any initialization except zero 'em */
-
-	/* Calculate overhead, if not done yet */
-	if( !SampleOverheadCalculated )
+	/* Calculate sampling overhead, if not done yet in this backend */
+	if (!SampleOverheadCalculated)
 		CalculateSampleOverhead();

+	instr = palloc0(n * sizeof(Instrumentation));
+
+	/* we don't need to do any initialization except zero 'em */
+
 	return instr;
 }
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
 {
 	if (INSTR_TIME_IS_ZERO(instr->starttime))
 	{
-		/* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
-		if (instr->tuplecount < SAMPLE_THRESHOLD)
+		/*
+		 * Always sample if not yet up to threshold, else check whether
+		 * next threshold has been reached
+		 */
+		if (instr->itercount < SAMPLE_THRESHOLD)
 			instr->sampling = true;
-		else
+		else if (instr->itercount >= instr->nextsample)
 		{
-			/* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
-			if( instr->tuplecount > instr->nextsample )
-			{
-				instr->sampling = true;
-				/* The doubling is so the random will average 1 over time */
-				instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX;
-			}
+			instr->sampling = true;
+			instr->nextsample =
+				instr->itercount + SampleInterval(instr->itercount);
 		}
 	}
 	if (instr->sampling)
 		INSTR_TIME_SET_CURRENT(instr->starttime);
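
Note why SampleInterval() divides by MAX_RANDOM_VALUE/2 rather than
MAX_RANDOM_VALUE: the random factor is then uniform on [0, 2] with mean 1.0,
so the interval averages exactly the SampleFunc() value. The deleted code
reached the same result with an explicit 2.0 multiplier ("the doubling is so
the random will average 1 over time"). A quick standalone check
(illustrative only; stdlib rand()/RAND_MAX stands in for the backend's
random()/MAX_RANDOM_VALUE):

    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        double  sum = 0.0;
        int     i;

        for (i = 0; i < 1000000; i++)
            sum += (double) rand() / ((double) RAND_MAX / 2.0);
        /* expect a mean close to 1.0 */
        printf("mean randomization factor = %.4f\n", sum / 1000000.0);
        return 0;
    }
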
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
 void
 InstrStopNode(Instrumentation *instr, double nTuples)
 {
-	instr_time	endtime;
-
-	/* count the returned tuples */
+	/* count the returned tuples and iterations */
 	instr->tuplecount += nTuples;
+	instr->itercount += 1;

 	/* measure runtime if appropriate */
 	if (instr->sampling)
 	{
+		instr_time	endtime;
+
 		if (INSTR_TIME_IS_ZERO(instr->starttime))
 		{
 			elog(DEBUG2, "InstrStopNode called without start");
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 #endif

 		INSTR_TIME_SET_ZERO(instr->starttime);
-		instr->samplecount += nTuples;
+
+		instr->samplecount += 1;
 		instr->sampling = false;
 	}
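
The renaming here is the crux of the change: tuplecount accumulates nTuples
(rows returned, a double and possibly fractional), itercount counts node
invocations, and samplecount now counts only the invocations that were
actually timed. A toy illustration (not backend code; the numbers are made
up) of how the three counters diverge:

    #include <stdio.h>

    int
    main(void)
    {
        double  tuplecount = 0;     /* rows returned, advanced by nTuples */
        long    itercount = 0;      /* node invocations */
        long    samplecount = 0;    /* timed invocations only */
        int     call;

        for (call = 0; call < 3; call++)
        {
            tuplecount += 100;      /* as in InstrStopNode(instr, 100) */
            itercount += 1;
            if (call < 2)           /* pretend sampling timed two calls */
                samplecount += 1;
        }
        printf("tuples=%.0f iters=%ld samples=%ld\n",
               tuplecount, itercount, samplecount);
        return 0;
    }
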
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
 	if (!INSTR_TIME_IS_ZERO(instr->starttime))
 		elog(DEBUG2, "InstrEndLoop called on running node");

-	/* Accumulate per-cycle statistics into totals */
+	/* Compute time spent in node */
 	totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);

-	instr->startup += instr->firsttuple;
-
-	/* Here we take into account sampling effects. Doing it naively ends
-	 * up assuming the sampling overhead applies to all tuples, even the
-	 * ones we didn't measure. We've calculated an overhead, so we
-	 * subtract that for all samples we didn't measure. The first tuple
-	 * is also special cased, because it usually takes longer. */
-	if( instr->samplecount < instr->tuplecount )
+	/*
+	 * If we didn't measure runtime on every iteration, then we must increase
+	 * the measured total to account for the other iterations.  Naively
+	 * multiplying totaltime by itercount/samplecount would be wrong because
+	 * it effectively assumes the sampling overhead applies to all iterations,
+	 * even the ones we didn't measure.  (Note that what we are trying to
+	 * estimate here is the actual time spent in the node, including the
+	 * actual measurement overhead; not the time exclusive of measurement
+	 * overhead.)  We exclude the first iteration from the correction basis,
+	 * because it often takes longer than others.
+	 */
+	if (instr->itercount > instr->samplecount)
 	{
-		double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
-		instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
-			+ ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+		double		per_iter;
+
+		per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
+			- SampleOverhead;
+		if (per_iter > 0)			/* sanity check */
+			totaltime += per_iter * (instr->itercount - instr->samplecount);
 	}
-	else
-		instr->total += totaltime;

+	/* Accumulate per-cycle statistics into totals */
+	instr->startup += instr->firsttuple;
+	instr->total += totaltime;
 	instr->ntuples += instr->tuplecount;
 	instr->nsamples += instr->samplecount;
 	instr->nloops += 1;

 	/* Reset for next cycle (if any) */
 	instr->running = false;
 	instr->sampling = false;
 	INSTR_TIME_SET_ZERO(instr->starttime);
 	INSTR_TIME_SET_ZERO(instr->counter);
 	instr->firsttuple = 0;
-	instr->samplecount = 0;
 	instr->tuplecount = 0;
+	instr->itercount = 0;
+	instr->samplecount = 0;
+	instr->nextsample = 0;
 }
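
To make the correction concrete, here is a worked example as a standalone
sketch. Every number is made up for illustration; the variable names mirror
the patch, but nothing below is backend code. With 10,000 iterations of
which 150 were timed, a measured total of 0.030 s, a first iteration of
0.001 s, and a calibrated overhead of 20 microseconds per sample, the
per-iteration estimate is (0.030 - 0.001) / 149 - 20e-6, about 175
microseconds, and the corrected total comes out near 1.75 s:

    #include <stdio.h>

    int
    main(void)
    {
        double  totaltime = 0.030;          /* measured over sampled iterations */
        double  firsttuple = 0.001;         /* first iteration, excluded from basis */
        double  sample_overhead = 20e-6;    /* from calibration */
        long    itercount = 10000;
        long    samplecount = 150;
        double  per_iter;

        per_iter = (totaltime - firsttuple) / (samplecount - 1)
            - sample_overhead;
        if (per_iter > 0)                   /* sanity check, as in the patch */
            totaltime += per_iter * (itercount - samplecount);
        printf("estimated node total: %.3f s\n", totaltime);    /* ~1.750 */
        return 0;
    }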