mirror of https://github.com/postgres/postgres
Ants Aasma, Greg Smithpull/3/head
parent
5b4f346611
commit
cee523867d
@ -0,0 +1 @@ |
||||
/pg_test_timing |
||||
@ -0,0 +1,18 @@ |
||||
# contrib/pg_test_timing/Makefile
|
||||
|
||||
PGFILEDESC = "pg_test_timing - test timing overhead"
|
||||
PGAPPICON = win32
|
||||
|
||||
PROGRAM = pg_test_timing
|
||||
OBJS = pg_test_timing.o
|
||||
|
||||
ifdef USE_PGXS |
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS) |
||||
else |
||||
subdir = contrib/pg_test_timing
|
||||
top_builddir = ../..
|
||||
include $(top_builddir)/src/Makefile.global |
||||
include $(top_srcdir)/contrib/contrib-global.mk |
||||
endif |
||||
@ -0,0 +1,162 @@ |
||||
/*
|
||||
* pg_test_timing.c |
||||
* tests overhead of timing calls and their monotonicity: that |
||||
* they always move forward |
||||
*/ |
||||
|
||||
#include "postgres_fe.h" |
||||
|
||||
#include "getopt_long.h" |
||||
#include "portability/instr_time.h" |
||||
|
||||
static const char *progname; |
||||
|
||||
static int32 test_duration = 3; |
||||
|
||||
static void handle_args(int argc, char *argv[]); |
||||
static void test_timing(int32); |
||||
|
||||
int |
||||
main(int argc, char *argv[]) |
||||
{ |
||||
progname = get_progname(argv[0]); |
||||
|
||||
handle_args(argc, argv); |
||||
|
||||
test_timing(test_duration); |
||||
|
||||
return 0; |
||||
} |
||||
|
||||
static void |
||||
handle_args(int argc, char *argv[]) |
||||
{ |
||||
static struct option long_options[] = { |
||||
{"duration", required_argument, NULL, 'd'}, |
||||
{NULL, 0, NULL, 0} |
||||
}; |
||||
int option; /* Command line option */ |
||||
int optindex = 0; /* used by getopt_long */ |
||||
|
||||
if (argc > 1) |
||||
{ |
||||
if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0 || |
||||
strcmp(argv[1], "-?") == 0) |
||||
{ |
||||
printf("Usage: %s [-d DURATION]\n", progname); |
||||
exit(0); |
||||
} |
||||
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) |
||||
{ |
||||
puts("pg_test_timing (PostgreSQL) " PG_VERSION); |
||||
exit(0); |
||||
} |
||||
} |
||||
|
||||
while ((option = getopt_long(argc, argv, "d:", |
||||
long_options, &optindex)) != -1) |
||||
{ |
||||
switch (option) |
||||
{ |
||||
case 'd': |
||||
test_duration = atoi(optarg); |
||||
break; |
||||
|
||||
default: |
||||
fprintf(stderr, "Try \"%s --help\" for more information.\n", |
||||
progname); |
||||
exit(1); |
||||
break; |
||||
} |
||||
} |
||||
|
||||
if (argc > optind) |
||||
{ |
||||
fprintf(stderr, |
||||
"%s: too many command-line arguments (first is \"%s\")\n", |
||||
progname, argv[optind]); |
||||
fprintf(stderr, "Try \"%s --help\" for more information.\n", |
||||
progname); |
||||
exit(1); |
||||
} |
||||
|
||||
if (test_duration > 0) |
||||
{ |
||||
printf("Testing timing overhead for %d seconds.\n", test_duration); |
||||
} |
||||
else |
||||
{ |
||||
fprintf(stderr, |
||||
"%s: duration must be a positive integer (duration is \"%d\")\n", |
||||
progname, test_duration); |
||||
fprintf(stderr, "Try \"%s --help\" for more information.\n", |
||||
progname); |
||||
exit(1); |
||||
} |
||||
} |
||||
|
||||
static void |
||||
test_timing(int32 duration) |
||||
{ |
||||
uint64 total_time; |
||||
int64 time_elapsed = 0; |
||||
uint64 loop_count = 0; |
||||
uint64 prev, cur; |
||||
int32 diff, i, bits, found; |
||||
|
||||
instr_time start_time, end_time, temp; |
||||
|
||||
static int64 histogram[32]; |
||||
|
||||
total_time = duration > 0 ? duration * 1000000 : 0; |
||||
|
||||
INSTR_TIME_SET_CURRENT(start_time); |
||||
cur = INSTR_TIME_GET_MICROSEC(start_time); |
||||
|
||||
while (time_elapsed < total_time) |
||||
{ |
||||
prev = cur; |
||||
INSTR_TIME_SET_CURRENT(temp); |
||||
cur = INSTR_TIME_GET_MICROSEC(temp); |
||||
diff = cur - prev; |
||||
|
||||
if (diff < 0) |
||||
{ |
||||
printf("Detected clock going backwards in time.\n"); |
||||
printf("Time warp: %d microseconds\n", diff); |
||||
exit(1); |
||||
} |
||||
|
||||
bits = 0; |
||||
while (diff) |
||||
{ |
||||
diff >>= 1; |
||||
bits++; |
||||
} |
||||
histogram[bits]++; |
||||
|
||||
loop_count++; |
||||
INSTR_TIME_SUBTRACT(temp, start_time); |
||||
time_elapsed = INSTR_TIME_GET_MICROSEC(temp); |
||||
} |
||||
|
||||
INSTR_TIME_SET_CURRENT(end_time); |
||||
|
||||
INSTR_TIME_SUBTRACT(end_time, start_time); |
||||
|
||||
printf("Per loop time including overhead: %0.2f nsec\n", |
||||
INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count); |
||||
printf("Histogram of timing durations:\n"); |
||||
printf("%9s: %10s %9s\n", "< usec", "count", "percent"); |
||||
|
||||
found = 0; |
||||
for (i = 31; i >= 0; i--) |
||||
{ |
||||
if (found || histogram[i]) |
||||
{ |
||||
found = 1; |
||||
printf("%9ld: %10ld %8.5f%%\n", 1l << i, histogram[i], |
||||
(double) histogram[i] * 100 / loop_count); |
||||
} |
||||
} |
||||
} |
||||
@ -0,0 +1,261 @@ |
||||
<!-- doc/src/sgml/pgtesttiming.sgml --> |
||||
|
||||
<sect1 id="pgtesttiming" xreflabel="pg_test_timing"> |
||||
<title>pg_test_timing</title> |
||||
|
||||
<indexterm zone="pgtesttiming"> |
||||
<primary>pg_test_timing</primary> |
||||
</indexterm> |
||||
|
||||
<para> |
||||
<application>pg_test_timing</> is a tool to measure the timing overhead |
||||
on your system and confirm that the system time never moves backwards. |
||||
Systems that are slow to collect timing data can give less accurate |
||||
<command>EXPLAIN ANALYZE</command> results. |
||||
</para> |
||||
|
||||
<sect2> |
||||
<title>Usage</title> |
||||
|
||||
<synopsis> |
||||
pg_test_timing [options] |
||||
</synopsis> |
||||
|
||||
<para> |
||||
<application>pg_test_timing</application> accepts the following |
||||
command-line options: |
||||
|
||||
<variablelist> |
||||
|
||||
<varlistentry> |
||||
<term><option>-d</option></term> |
||||
<term><option>--duration</option></term> |
||||
<listitem> |
||||
<para> |
||||
Specifies the test duration, in seconds. Longer durations |
||||
give slightly better accuracy, and are more likely to discover |
||||
problems with the system clock moving backwards. The default |
||||
test duration is 3 seconds. |
||||
</para> |
||||
</listitem> |
||||
</varlistentry> |
||||
|
||||
</variablelist> |
||||
</para> |
||||
|
||||
</sect2> |
||||
|
||||
<sect2> |
||||
<title>Interpreting results</title> |
||||
|
||||
<para> |
||||
Good results will show most (>90%) individual timing calls take less |
||||
than one microsecond. Average per loop overhead will be even lower, |
||||
below 100 nanoseconds. This example from an Intel i7-860 system using |
||||
a TSC clock source shows excellent performance: |
||||
</para> |
||||
|
||||
<screen> |
||||
Testing timing overhead for 3 seconds. |
||||
Per loop time including overhead: 35.96 nsec |
||||
Histogram of timing durations: |
||||
< usec: count percent |
||||
16: 2 0.00000% |
||||
8: 13 0.00002% |
||||
4: 126 0.00015% |
||||
2: 2999652 3.59518% |
||||
1: 80435604 96.40465% |
||||
</screen> |
||||
|
||||
<para> |
||||
Note that different units are used for the per loop time than the |
||||
histogram. The loop can have resolution within a few nanoseconds |
||||
(nsec), while the individual timing calls can only resolve down to |
||||
one microsecond (usec). |
||||
</para> |
||||
|
||||
</sect2> |
||||
<sect2> |
||||
<title>Measuring executor timing overhead</title> |
||||
|
||||
<para> |
||||
When the query executor is running a statement using |
||||
<command>EXPLAIN ANALYZE</command>, individual operations are |
||||
timed as well as showing a summary. The overhead of your system |
||||
can be checked by counting rows with the psql program: |
||||
</para> |
||||
|
||||
<screen> |
||||
CREATE TABLE t AS SELECT * FROM generate_series(1,100000); |
||||
\timing |
||||
SELECT COUNT(*) FROM t; |
||||
EXPLAIN ANALYZE SELECT COUNT(*) FROM t; |
||||
</screen> |
||||
|
||||
<para> |
||||
The i7-860 system measured runs the count query in 9.8 ms while |
||||
the <command>EXPLAIN ANALYZE</command> version takes 16.6 ms, |
||||
each processing just over 100,000 rows. That 6.8 ms difference |
||||
means the timing overhead per row is 68 ns, about twice what |
||||
pg_test_timing estimated it would be. Even that relatively |
||||
small amount of overhead is making the fully timed count statement |
||||
take almost 70% longer. On more substantial queries, the |
||||
timing overhead would be less problematic. |
||||
</para> |
||||
|
||||
</sect2> |
||||
<sect2> |
||||
<title>Changing time sources</title> |
||||
<para> |
||||
On some newer Linux systems, it's possible to change the clock |
||||
source used to collect timing data at any time. A second example |
||||
shows the slowdown possible from switching to the slower acpi_pm |
||||
time source, on the same system used for the fast results above: |
||||
</para> |
||||
|
||||
<screen> |
||||
# cat /sys/devices/system/clocksource/clocksource0/available_clocksource |
||||
tsc hpet acpi_pm |
||||
# echo acpi_pm > /sys/devices/system/clocksource/clocksource0/current_clocksource |
||||
# pg_test_timing |
||||
Per loop time including overhead: 722.92 nsec |
||||
Histogram of timing durations: |
||||
< usec: count percent |
||||
16: 3 0.00007% |
||||
8: 563 0.01357% |
||||
4: 3241 0.07810% |
||||
2: 2990371 72.05956% |
||||
1: 1155682 27.84870% |
||||
</screen> |
||||
|
||||
<para> |
||||
In this configuration, the sample <command>EXPLAIN ANALYZE</command> |
||||
above takes 115.9 ms. That's 1061 nsec of timing overhead, again |
||||
a small multiple of what's measured directly by this utility. |
||||
That much timing overhead means the actual query itself is only |
||||
taking a tiny fraction of the accounted for time, most of it |
||||
is being consumed in overhead instead. In this configuration, |
||||
any <command>EXPLAIN ANALYZE</command> totals involving many |
||||
timed operations would be inflated significantly by timing overhead. |
||||
</para> |
||||
|
||||
<para> |
||||
FreeBSD also allows changing the time source on the fly, and |
||||
it logs information about the timer selected during boot: |
||||
</para> |
||||
|
||||
<screen> |
||||
dmesg | grep "Timecounter" |
||||
sysctl kern.timecounter.hardware=TSC |
||||
</screen> |
||||
|
||||
<para> |
||||
Other systems may only allow setting the time source on boot. |
||||
On older Linux systems the "clock" kernel setting is the only way |
||||
to make this sort of change. And even on some more recent ones, |
||||
the only option you'll see for a clock source is "jiffies". Jiffies |
||||
are the older Linux software clock implementation, which can have |
||||
good resolution when it's backed by fast enough timing hardware, |
||||
as in this example: |
||||
</para> |
||||
|
||||
<screen> |
||||
$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource |
||||
jiffies |
||||
$ dmesg | grep time.c |
||||
time.c: Using 3.579545 MHz WALL PM GTOD PIT/TSC timer. |
||||
time.c: Detected 2400.153 MHz processor. |
||||
$ ./pg_test_timing |
||||
Testing timing overhead for 3 seconds. |
||||
Per timing duration including loop overhead: 97.75 ns |
||||
Histogram of timing durations: |
||||
< usec: count percent |
||||
32: 1 0.00000% |
||||
16: 1 0.00000% |
||||
8: 22 0.00007% |
||||
4: 3010 0.00981% |
||||
2: 2993204 9.75277% |
||||
1: 27694571 90.23734% |
||||
</screen> |
||||
|
||||
</sect2> |
||||
<sect2> |
||||
<title>Clock hardware and timing accuracy</title> |
||||
|
||||
<para> |
||||
Collecting accurate timing information is normally done on computers |
||||
using hardware clocks with various levels of accuracy. With some |
||||
hardware the operating systems can pass the system clock time almost |
||||
directly to programs. A system clock can also be derived from a chip |
||||
that simply provides timing interrupts, periodic ticks at some known |
||||
time interval. In either case, operating system kernels provide |
||||
a clock source that hides these details. But the accuracy of that |
||||
clock source and how quickly it can return results varies based |
||||
on the underlying hardware. |
||||
</para> |
||||
|
||||
<para> |
||||
Inaccurate time keeping can result in system instability. Test |
||||
any change to the clock source very carefully. Operating system |
||||
defaults are sometimes made to favor reliability over best |
||||
accuracy. And if you are using a virtual machine, look into the |
||||
recommended time sources compatible with it. Virtual hardware |
||||
faces additional difficulties when emulating timers, and there |
||||
are often per operating system settings suggested by vendors. |
||||
</para> |
||||
|
||||
<para> |
||||
The Time Stamp Counter (TSC) clock source is the most accurate one |
||||
available on current generation CPUs. It's the preferred way to track |
||||
the system time when it's supported by the operating system and the |
||||
TSC clock is reliable. There are several ways that TSC can fail |
||||
to provide an accurate timing source, making it unreliable. Older |
||||
systems can have a TSC clock that varies based on the CPU |
||||
temperature, making it unusable for timing. Trying to use TSC on some |
||||
older multi-core CPUs can give a reported time that's inconsistent |
||||
among multiple cores. This can result in the time going backwards, a |
||||
problem this program checks for. And even the newest systems can |
||||
fail to provide accurate TSC timing with very aggressive power saving |
||||
configurations. |
||||
</para> |
||||
|
||||
<para> |
||||
Newer operating systems may check for the known TSC problems and |
||||
switch to a slower, more stable clock source when they are seen. |
||||
If your system supports TSC time but doesn't default to that, it |
||||
may be disabled for a good reason. And some operating systems may |
||||
not detect all the possible problems correctly, or will allow using |
||||
TSC even in situations where it's known to be inaccurate. |
||||
</para> |
||||
|
||||
<para> |
||||
The High Precision Event Timer (HPET) is the preferred timer on |
||||
systems where it's available and TSC is not accurate. The timer |
||||
chip itself is programmable to allow up to 100 nanosecond resolution, |
||||
but you may not see that much accuracy in your system clock. |
||||
</para> |
||||
|
||||
<para> |
||||
Advanced Configuration and Power Interface (ACPI) provides a |
||||
Power Management (PM) Timer, which Linux refers to as the acpi_pm. |
||||
The clock derived from acpi_pm will at best provide 300 nanosecond |
||||
resolution. |
||||
</para> |
||||
|
||||
<para> |
||||
Timers used on older PC hardware including the 8254 Programmable |
||||
Interval Timer (PIT), the real-time clock (RTC), the Advanced |
||||
Programmable Interrupt Controller (APIC) timer, and the Cyclone |
||||
timer. These timers aim for millisecond resolution. |
||||
</para> |
||||
</sect2> |
||||
|
||||
<sect2> |
||||
<title>Author</title> |
||||
|
||||
<para> |
||||
Ants Aasma <email>ants.aasma@eesti.ee</email> |
||||
</para> |
||||
</sect2> |
||||
|
||||
</sect1> |
||||
Loading…
Reference in new issue