# Macros to detect C compiler features
# config/c-compiler.m4


# PGAC_PRINTF_ARCHETYPE
# ---------------------
# Select the format archetype to be used by gcc to check printf-type functions.
# We prefer "gnu_printf", as that most closely matches the features supported
# by src/port/snprintf.c (particularly the %m conversion spec).  However,
# on some NetBSD versions, that doesn't work while "__syslog__" does.
# If all else fails, use "printf".
AC_DEFUN([PGAC_PRINTF_ARCHETYPE],
[AC_CACHE_CHECK([for printf format archetype], pgac_cv_printf_archetype,
[pgac_cv_printf_archetype=gnu_printf
PGAC_TEST_PRINTF_ARCHETYPE
if [[ "$ac_archetype_ok" = no ]]; then
  pgac_cv_printf_archetype=__syslog__
  PGAC_TEST_PRINTF_ARCHETYPE
  if [[ "$ac_archetype_ok" = no ]]; then
    pgac_cv_printf_archetype=printf
  fi
fi])
AC_DEFINE_UNQUOTED([PG_PRINTF_ATTRIBUTE], [$pgac_cv_printf_archetype],
[Define to best printf format archetype, usually gnu_printf if available.])
])# PGAC_PRINTF_ARCHETYPE


# Subroutine: test $pgac_cv_printf_archetype, set $ac_archetype_ok to yes or no
AC_DEFUN([PGAC_TEST_PRINTF_ARCHETYPE],
[ac_save_c_werror_flag=$ac_c_werror_flag
ac_c_werror_flag=yes
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
[extern void pgac_write(int ignore, const char *fmt,...)
__attribute__((format($pgac_cv_printf_archetype, 2, 3)));],
[pgac_write(0, "error %s: %m", "foo");])],
[ac_archetype_ok=yes],
[ac_archetype_ok=no])
ac_c_werror_flag=$ac_save_c_werror_flag
])# PGAC_TEST_PRINTF_ARCHETYPE
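

# Example usage sketch (illustrative only; "my_log" is a hypothetical
# function, not something defined by this file).  C code picks up the
# archetype chosen above through the PG_PRINTF_ATTRIBUTE define:
#
#     extern int my_log(const char *fmt, ...)
#         __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2)));
#
# With the gnu_printf archetype, gcc will then also accept the %m
# conversion spec when type-checking calls to my_log().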


# PGAC_TYPE_128BIT_INT
# --------------------
# Check if __int128 is a working 128 bit integer type, and if so
# define PG_INT128_TYPE to that typename, and define ALIGNOF_PG_INT128_TYPE
# as its alignment requirement.
#
# This currently only detects a GCC/clang extension, but support for other
# environments may be added in the future.
#
# For the moment we only test for support for 128bit math; support for
# 128bit literals and snprintf is not required.
AC_DEFUN([PGAC_TYPE_128BIT_INT],
[AC_CACHE_CHECK([for __int128], [pgac_cv__128bit_int],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([
/*
 * We don't actually run this test, just link it to verify that any support
 * functions needed for __int128 are present.
 *
 * These are globals to discourage the compiler from folding all the
 * arithmetic tests down to compile-time constants.  We do not have
 * convenient support for 128bit literals at this point...
 */
__int128 a = 48828125;
__int128 b = 97656250;
],[
__int128 c,d;
a = (a << 12) + 1; /* 200000000001 */
b = (b << 12) + 5; /* 400000000005 */
/* try the most relevant arithmetic ops */
c = a * b;
d = (c + b) / b;
/* must use the results, else compiler may optimize arithmetic away */
if (d != a+1)
  return 1;
])],
[pgac_cv__128bit_int=yes],
[pgac_cv__128bit_int=no])])
if test x"$pgac_cv__128bit_int" = xyes ; then
  # Use of non-default alignment with __int128 tickles bugs in some compilers.
  # If not cross-compiling, we can test for bugs and disable use of __int128
  # with buggy compilers.  If cross-compiling, hope for the best.
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83925
  AC_CACHE_CHECK([for __int128 alignment bug], [pgac_cv__128bit_int_bug],
  [AC_RUN_IFELSE([AC_LANG_PROGRAM([
/* This must match the corresponding code in c.h: */
#if defined(__GNUC__) || defined(__SUNPRO_C)
#define pg_attribute_aligned(a) __attribute__((aligned(a)))
#elif defined(_MSC_VER)
#define pg_attribute_aligned(a) __declspec(align(a))
#endif
typedef __int128 int128a
#if defined(pg_attribute_aligned)
pg_attribute_aligned(8)
#endif
;
int128a holder;
void pass_by_val(void *buffer, int128a par) { holder = par; }
],[
long int i64 = 97656225L << 12;
int128a q;
pass_by_val(main, (int128a) i64);
q = (int128a) i64;
if (q != holder)
  return 1;
])],
  [pgac_cv__128bit_int_bug=ok],
  [pgac_cv__128bit_int_bug=broken],
  [pgac_cv__128bit_int_bug="assuming ok"])])
  if test x"$pgac_cv__128bit_int_bug" != xbroken ; then
    AC_DEFINE(PG_INT128_TYPE, __int128, [Define to the name of a signed 128-bit integer type.])
    AC_CHECK_ALIGNOF(PG_INT128_TYPE)
  fi
fi])# PGAC_TYPE_128BIT_INT
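

# Example usage sketch (illustrative only; assumes a caller along these
# lines, not code from this file).  Once PG_INT128_TYPE is defined, a
# 128-bit accumulator can sum int64 values without overflow checks:
#
#     #ifdef PG_INT128_TYPE
#     PG_INT128_TYPE sum = 0;
#     sum += (PG_INT128_TYPE) int64_val;
#     #endif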


# PGAC_C_STATIC_ASSERT
# --------------------
# Check if the C compiler understands _Static_assert(),
# and define HAVE__STATIC_ASSERT if so.
#
# We actually check the syntax ({ _Static_assert(...) }), because we need
# gcc-style compound expressions to be able to wrap the thing into macros.
AC_DEFUN([PGAC_C_STATIC_ASSERT],
[AC_CACHE_CHECK(for _Static_assert, pgac_cv__static_assert,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
[({ _Static_assert(1, "foo"); })])],
[pgac_cv__static_assert=yes],
[pgac_cv__static_assert=no])])
if test x"$pgac_cv__static_assert" = xyes ; then
  AC_DEFINE(HAVE__STATIC_ASSERT, 1,
            [Define to 1 if your compiler understands _Static_assert.])
fi])# PGAC_C_STATIC_ASSERT
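

# Example usage sketch (illustrative only; MY_STATIC_ASSERT is a
# hypothetical macro name).  The compound-expression form tested above is
# what lets the assertion be wrapped so it can appear inside expressions:
#
#     #define MY_STATIC_ASSERT(cond, msg) \
#         ((void) ({ _Static_assert(cond, msg); }))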


# PGAC_C_TYPEOF
# -------------
# Check if the C compiler understands typeof or a variant.  Define
# HAVE_TYPEOF if so, and define 'typeof' to the actual key word.
#
AC_DEFUN([PGAC_C_TYPEOF],
[AC_CACHE_CHECK(for typeof, pgac_cv_c_typeof,
[pgac_cv_c_typeof=no
for pgac_kw in typeof __typeof__; do
  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],
[int x = 0;
$pgac_kw(x) y;
y = x;
return y;])],
[pgac_cv_c_typeof=$pgac_kw])
  test "$pgac_cv_c_typeof" != no && break
done])
if test "$pgac_cv_c_typeof" != no; then
  AC_DEFINE(HAVE_TYPEOF, 1,
            [Define to 1 if your compiler understands `typeof' or something similar.])
  if test "$pgac_cv_c_typeof" != typeof; then
    AC_DEFINE_UNQUOTED(typeof, $pgac_cv_c_typeof, [Define to how the compiler spells `typeof'.])
  fi
fi])# PGAC_C_TYPEOF
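

# Example usage sketch (illustrative only; SWAP is a hypothetical macro).
# typeof lets a macro declare a temporary of the same type as its argument:
#
#     #define SWAP(a, b) \
#         do { typeof(a) tmp_ = (a); (a) = (b); (b) = tmp_; } while (0)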


# PGAC_C_TYPES_COMPATIBLE
# -----------------------
# Check if the C compiler understands __builtin_types_compatible_p,
# and define HAVE__BUILTIN_TYPES_COMPATIBLE_P if so.
#
# We check usage with __typeof__, though it's unlikely any compiler would
# have the former and not the latter.
AC_DEFUN([PGAC_C_TYPES_COMPATIBLE],
[AC_CACHE_CHECK(for __builtin_types_compatible_p, pgac_cv__types_compatible,
[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],
[[ int x; static int y[__builtin_types_compatible_p(__typeof__(x), int)]; ]])],
[pgac_cv__types_compatible=yes],
[pgac_cv__types_compatible=no])])
if test x"$pgac_cv__types_compatible" = xyes ; then
  AC_DEFINE(HAVE__BUILTIN_TYPES_COMPATIBLE_P, 1,
            [Define to 1 if your compiler understands __builtin_types_compatible_p.])
fi])# PGAC_C_TYPES_COMPATIBLE
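

# Example usage sketch (illustrative only).  The builtin evaluates to an
# integer constant expression, so it can back a compile-time type check:
#
#     /* fails to compile if x is not an int */
#     _Static_assert(__builtin_types_compatible_p(__typeof__(x), int),
#                    "x must be an int");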


# PGAC_C_BUILTIN_CONSTANT_P
# -------------------------
# Check if the C compiler understands __builtin_constant_p(),
# and define HAVE__BUILTIN_CONSTANT_P if so.
# We need __builtin_constant_p("string literal") to be true, but some older
# compilers don't think that, so test for that case explicitly.
AC_DEFUN([PGAC_C_BUILTIN_CONSTANT_P],
[AC_CACHE_CHECK(for __builtin_constant_p, pgac_cv__builtin_constant_p,
[AC_COMPILE_IFELSE([AC_LANG_SOURCE(
[[static int x;
  static int y[__builtin_constant_p(x) ? x : 1];
  static int z[__builtin_constant_p("string literal") ? 1 : x];
]]
)],
[pgac_cv__builtin_constant_p=yes],
[pgac_cv__builtin_constant_p=no])])
if test x"$pgac_cv__builtin_constant_p" = xyes ; then
  AC_DEFINE(HAVE__BUILTIN_CONSTANT_P, 1,
            [Define to 1 if your compiler understands __builtin_constant_p.])
fi])# PGAC_C_BUILTIN_CONSTANT_P
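

# Example usage sketch (illustrative only; elevel, ERROR and
# pg_unreachable() stand in for a caller's own definitions).  The builtin
# lets a macro take a special path only when its argument is a
# compile-time constant:
#
#     if (__builtin_constant_p(elevel) && (elevel) >= ERROR)
#         pg_unreachable();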


# PGAC_C_BUILTIN_OP_OVERFLOW
# --------------------------
# Check if the C compiler understands __builtin_$op_overflow(),
# and define HAVE__BUILTIN_OP_OVERFLOW if so.
#
# Check for the most complicated case, 64 bit multiplication, as a
# proxy for all of the operations.  To detect the case where the compiler
# knows the function but library support is missing, we must link not just
# compile, and store the results in global variables so the compiler doesn't
# optimize away the call.
AC_DEFUN([PGAC_C_BUILTIN_OP_OVERFLOW],
[AC_CACHE_CHECK(for __builtin_mul_overflow, pgac_cv__builtin_op_overflow,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([
#include <stdint.h>
int64_t a = 1;
int64_t b = 1;
int64_t result;
int oflo;
],
[oflo = __builtin_mul_overflow(a, b, &result);])],
[pgac_cv__builtin_op_overflow=yes],
[pgac_cv__builtin_op_overflow=no])])
if test x"$pgac_cv__builtin_op_overflow" = xyes ; then
  AC_DEFINE(HAVE__BUILTIN_OP_OVERFLOW, 1,
            [Define to 1 if your compiler understands __builtin_$op_overflow.])
fi])# PGAC_C_BUILTIN_OP_OVERFLOW
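

# Example usage sketch (illustrative only).  The builtin returns true if
# the operation overflowed; the (possibly wrapped) result is stored
# through the pointer either way:
#
#     int64_t result;
#     if (__builtin_mul_overflow(a, b, &result))
#         /* handle overflow */ ;
#
# __builtin_add_overflow and __builtin_sub_overflow work the same way.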


# PGAC_C_BUILTIN_UNREACHABLE
# --------------------------
# Check if the C compiler understands __builtin_unreachable(),
# and define HAVE__BUILTIN_UNREACHABLE if so.
#
# NB: Don't get the idea of putting a for(;;); or such before the
# __builtin_unreachable() call.  Some compilers would remove it before linking
# and only a warning instead of an error would be produced.
AC_DEFUN([PGAC_C_BUILTIN_UNREACHABLE],
[AC_CACHE_CHECK(for __builtin_unreachable, pgac_cv__builtin_unreachable,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
[__builtin_unreachable();])],
[pgac_cv__builtin_unreachable=yes],
[pgac_cv__builtin_unreachable=no])])
if test x"$pgac_cv__builtin_unreachable" = xyes ; then
  AC_DEFINE(HAVE__BUILTIN_UNREACHABLE, 1,
            [Define to 1 if your compiler understands __builtin_unreachable.])
fi])# PGAC_C_BUILTIN_UNREACHABLE
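

# Example usage sketch (illustrative only; my_unreachable is a hypothetical
# macro name).  A typical wrapper falls back to abort() when the builtin is
# unavailable:
#
#     #ifdef HAVE__BUILTIN_UNREACHABLE
#     #define my_unreachable() __builtin_unreachable()
#     #else
#     #define my_unreachable() abort()
#     #endif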


# PGAC_C_COMPUTED_GOTO
# --------------------
# Check if the C compiler knows computed gotos (gcc extension, also
# available in at least clang).  If so, define HAVE_COMPUTED_GOTO.
#
# Checking whether computed gotos are supported syntax-wise ought to
# be enough, as the syntax is otherwise illegal.
AC_DEFUN([PGAC_C_COMPUTED_GOTO],
[AC_CACHE_CHECK(for computed goto support, pgac_cv_computed_goto,
[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([],
[[void *labeladdrs[] = {&&my_label};
  goto *labeladdrs[0];
  my_label:
  return 1;
]])],
[pgac_cv_computed_goto=yes],
[pgac_cv_computed_goto=no])])
if test x"$pgac_cv_computed_goto" = xyes ; then
  AC_DEFINE(HAVE_COMPUTED_GOTO, 1,
            [Define to 1 if your compiler handles computed gotos.])
fi])# PGAC_C_COMPUTED_GOTO
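

# Example usage sketch (illustrative only).  Computed gotos enable
# direct-threaded dispatch, where each handler jumps straight to the next
# opcode's label instead of going back through a switch:
#
#     static void *dispatch[] = {&&do_add, &&do_sub};
#     goto *dispatch[opcode];
#     do_add: /* ... */ ;
#     do_sub: /* ... */ ;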


# PGAC_CHECK_BUILTIN_FUNC
# -----------------------
# This is similar to AC_CHECK_FUNCS(), except that it will work for compiler
# builtin functions, which AC_CHECK_FUNCS() usually fails to detect.
# The first argument is the function name, eg [__builtin_clzl], and the
# second is its argument list, eg [unsigned long x].  The current coding
# works only for a single argument named x; we might generalize that later.
# It's assumed that the function's result type is coercible to int.
# On success, we define "HAVEfuncname" (there's usually more than enough
# underscores already, so we don't add another one).
AC_DEFUN([PGAC_CHECK_BUILTIN_FUNC],
[AC_CACHE_CHECK(for $1, pgac_cv$1,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([
int
call$1($2)
{
	return $1(x);
}], [])],
[pgac_cv$1=yes],
[pgac_cv$1=no])])
if test x"${pgac_cv$1}" = xyes ; then
  AC_DEFINE_UNQUOTED(AS_TR_CPP([HAVE$1]), 1,
                     [Define to 1 if your compiler understands $1.])
fi])# PGAC_CHECK_BUILTIN_FUNC
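

# Example invocation sketch (illustrative only; not an invocation taken
# from this file):
#
#     PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x])
#
# On success this defines HAVE__BUILTIN_CLZ; note the argument list names
# its single parameter "x", as the macro requires.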


# PGAC_CHECK_BUILTIN_FUNC_PTR
# ---------------------------
# Like PGAC_CHECK_BUILTIN_FUNC, except that the function is assumed to
# return a pointer type, and the argument(s) should be given literally.
# This handles some cases that PGAC_CHECK_BUILTIN_FUNC doesn't.
AC_DEFUN([PGAC_CHECK_BUILTIN_FUNC_PTR],
[AC_CACHE_CHECK(for $1, pgac_cv$1,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([
void *
call$1(void)
{
	return $1($2);
}], [])],
[pgac_cv$1=yes],
[pgac_cv$1=no])])
if test x"${pgac_cv$1}" = xyes ; then
  AC_DEFINE_UNQUOTED(AS_TR_CPP([HAVE$1]), 1,
                     [Define to 1 if your compiler understands $1.])
fi])# PGAC_CHECK_BUILTIN_FUNC_PTR
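

# Example invocation sketch (illustrative only; not an invocation taken
# from this file):
#
#     PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0])
#
# Here the argument is given literally, and the builtin's pointer result
# is returned from the test function.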


# PGAC_PROG_VARCC_VARFLAGS_OPT
# ----------------------------
# Given a compiler, variable name and a string, check if the compiler
# supports the string as a command-line option.  If it does, add the
# string to the given variable.
AC_DEFUN([PGAC_PROG_VARCC_VARFLAGS_OPT],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_prog_$1_cflags_$3])])dnl
AC_CACHE_CHECK([whether ${$1} supports $3, for $2], [Ac_cachevar],
[pgac_save_CFLAGS=$CFLAGS
pgac_save_CC=$CC
CC=${$1}
CFLAGS="${$2} $3"
ac_save_c_werror_flag=$ac_c_werror_flag
ac_c_werror_flag=yes
_AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
                   [Ac_cachevar=yes],
                   [Ac_cachevar=no])
ac_c_werror_flag=$ac_save_c_werror_flag
CFLAGS="$pgac_save_CFLAGS"
CC="$pgac_save_CC"])
if test x"$Ac_cachevar" = x"yes"; then
  $2="${$2} $3"
fi
undefine([Ac_cachevar])dnl
])# PGAC_PROG_VARCC_VARFLAGS_OPT


# PGAC_PROG_CC_CFLAGS_OPT
# -----------------------
# Given a string, check if the compiler supports the string as a
# command-line option.  If it does, add the string to CFLAGS.
AC_DEFUN([PGAC_PROG_CC_CFLAGS_OPT], [
PGAC_PROG_VARCC_VARFLAGS_OPT(CC, CFLAGS, $1)
])# PGAC_PROG_CC_CFLAGS_OPT
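

# Example invocation sketch (illustrative only):
#
#     PGAC_PROG_CC_CFLAGS_OPT([-fno-strict-aliasing])
#
# The flag is appended to CFLAGS only if a test compile with it succeeds;
# because ac_c_werror_flag is set, a warning about an unknown option
# counts as failure.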


# PGAC_PROG_CC_VAR_OPT
# --------------------
# Given a variable name and a string, check if the compiler supports
# the string as a command-line option.  If it does, add the string to
# the given variable.
AC_DEFUN([PGAC_PROG_CC_VAR_OPT],
[PGAC_PROG_VARCC_VARFLAGS_OPT(CC, $1, $2)
])# PGAC_PROG_CC_VAR_OPT


# PGAC_PROG_VARCXX_VARFLAGS_OPT
# -----------------------------
# Given a compiler, variable name and a string, check if the compiler
# supports the string as a command-line option.  If it does, add the
# string to the given variable.
AC_DEFUN([PGAC_PROG_VARCXX_VARFLAGS_OPT],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_prog_$1_cxxflags_$3])])dnl
AC_CACHE_CHECK([whether ${$1} supports $3, for $2], [Ac_cachevar],
[pgac_save_CXXFLAGS=$CXXFLAGS
pgac_save_CXX=$CXX
CXX=${$1}
CXXFLAGS="${$2} $3"
ac_save_cxx_werror_flag=$ac_cxx_werror_flag
ac_cxx_werror_flag=yes
AC_LANG_PUSH(C++)
_AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
                   [Ac_cachevar=yes],
                   [Ac_cachevar=no])
AC_LANG_POP([])
ac_cxx_werror_flag=$ac_save_cxx_werror_flag
CXXFLAGS="$pgac_save_CXXFLAGS"
CXX="$pgac_save_CXX"])
if test x"$Ac_cachevar" = x"yes"; then
  $2="${$2} $3"
fi
undefine([Ac_cachevar])dnl
])# PGAC_PROG_VARCXX_VARFLAGS_OPT


# PGAC_PROG_CXX_CFLAGS_OPT
# ------------------------
# Given a string, check if the compiler supports the string as a
# command-line option.  If it does, add the string to CXXFLAGS.
AC_DEFUN([PGAC_PROG_CXX_CFLAGS_OPT],
[PGAC_PROG_VARCXX_VARFLAGS_OPT(CXX, CXXFLAGS, $1)
])# PGAC_PROG_CXX_CFLAGS_OPT


# PGAC_PROG_CC_LD_VARFLAGS_OPT
# ----------------------------
# Given a string, check if the compiler supports the string as a
# command-line option.  If it does, add to the given variable.
# For reasons you'd really rather not know about, this checks whether
# you can link to a particular function, not just whether you can link.
# In fact, we must actually check that the resulting program runs :-(
AC_DEFUN([PGAC_PROG_CC_LD_VARFLAGS_OPT],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_prog_cc_$1_$2])])dnl
AC_CACHE_CHECK([whether $CC supports $2, for $1], [Ac_cachevar],
[pgac_save_LDFLAGS=$LDFLAGS
LDFLAGS="$pgac_save_LDFLAGS $2"
AC_RUN_IFELSE([AC_LANG_PROGRAM([extern void $3 (); void (*fptr) () = $3;],[])],
              [Ac_cachevar=yes],
              [Ac_cachevar=no],
              [Ac_cachevar="assuming no"])
LDFLAGS="$pgac_save_LDFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
  $1="${$1} $2"
fi
undefine([Ac_cachevar])dnl
])# PGAC_PROG_CC_LD_VARFLAGS_OPT


# PGAC_PROG_CC_LDFLAGS_OPT
# ------------------------
# Convenience wrapper around PGAC_PROG_CC_LD_VARFLAGS_OPT that adds to
# LDFLAGS.
AC_DEFUN([PGAC_PROG_CC_LDFLAGS_OPT],
[PGAC_PROG_CC_LD_VARFLAGS_OPT(LDFLAGS, [$1], [$2])
])# PGAC_PROG_CC_LDFLAGS_OPT
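

# Example invocation sketch (illustrative only; the flag and function are
# placeholders):
#
#     PGAC_PROG_CC_LDFLAGS_OPT([-Wl,--as-needed], [main])
#
# The flag is added to LDFLAGS only if a program referencing the named
# function both links and runs with the flag in place.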


# PGAC_HAVE_GCC__SYNC_CHAR_TAS
# ----------------------------
# Check if the C compiler understands __sync_lock_test_and_set(char),
# and define HAVE_GCC__SYNC_CHAR_TAS
#
# NB: There are platforms where test_and_set is available but compare_and_swap
# is not, so test this separately.
# NB: Some platforms only do 32bit tas, others only do 8bit tas.  Test both.
AC_DEFUN([PGAC_HAVE_GCC__SYNC_CHAR_TAS],
[AC_CACHE_CHECK(for builtin __sync char locking functions, pgac_cv_gcc_sync_char_tas,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
  [char lock = 0;
   __sync_lock_test_and_set(&lock, 1);
   __sync_lock_release(&lock);])],
  [pgac_cv_gcc_sync_char_tas="yes"],
  [pgac_cv_gcc_sync_char_tas="no"])])
if test x"$pgac_cv_gcc_sync_char_tas" = x"yes"; then
  AC_DEFINE(HAVE_GCC__SYNC_CHAR_TAS, 1, [Define to 1 if you have __sync_lock_test_and_set(char *) and friends.])
fi])# PGAC_HAVE_GCC__SYNC_CHAR_TAS
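

# Example usage sketch (illustrative only).  These builtins are the basis
# of a test-and-set spinlock:
#
#     volatile char lock = 0;
#     while (__sync_lock_test_and_set(&lock, 1))
#         ;                           /* spin until acquired */
#     /* ... critical section ... */
#     __sync_lock_release(&lock);     /* store 0 with release semantics */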


# PGAC_HAVE_GCC__SYNC_INT32_TAS
# -----------------------------
# Check if the C compiler understands __sync_lock_test_and_set(),
# and define HAVE_GCC__SYNC_INT32_TAS
AC_DEFUN([PGAC_HAVE_GCC__SYNC_INT32_TAS],
[AC_CACHE_CHECK(for builtin __sync int32 locking functions, pgac_cv_gcc_sync_int32_tas,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
  [int lock = 0;
   __sync_lock_test_and_set(&lock, 1);
   __sync_lock_release(&lock);])],
  [pgac_cv_gcc_sync_int32_tas="yes"],
  [pgac_cv_gcc_sync_int32_tas="no"])])
if test x"$pgac_cv_gcc_sync_int32_tas" = x"yes"; then
  AC_DEFINE(HAVE_GCC__SYNC_INT32_TAS, 1, [Define to 1 if you have __sync_lock_test_and_set(int *) and friends.])
fi])# PGAC_HAVE_GCC__SYNC_INT32_TAS
|
Add a basic atomic ops API abstracting away platform/architecture details.
Several upcoming performance/scalability improvements require atomic
operations. This new API avoids the need to splatter compiler and
architecture dependent code over all the locations employing atomic
ops.
For several of the potential usages it'd be problematic to maintain
both, a atomics using implementation and one using spinlocks or
similar. In all likelihood one of the implementations would not get
tested regularly under concurrency. To avoid that scenario the new API
provides a automatic fallback of atomic operations to spinlocks. All
properties of atomic operations are maintained. This fallback -
obviously - isn't as fast as just using atomic ops, but it's not bad
either. For one of the future users the atomics ontop spinlocks
implementation was actually slightly faster than the old purely
spinlock using implementation. That's important because it reduces the
fear of regressing older platforms when improving the scalability for
new ones.
The API, loosely modeled after the C11 atomics support, currently
provides 'atomic flags' and 32 bit unsigned integers. If the platform
efficiently supports atomic 64 bit unsigned integers those are also
provided.
To implement atomics support for a platform/architecture/compiler for
a type of atomics 32bit compare and exchange needs to be
implemented. If available and more efficient native support for flags,
32 bit atomic addition, and corresponding 64 bit operations may also
be provided. Additional useful atomic operations are implemented
generically ontop of these.
The implementation for various versions of gcc, msvc and sun studio have
been tested. Additional existing stub implementations for
* Intel icc
* HUPX acc
* IBM xlc
are included but have never been tested. These will likely require
fixes based on buildfarm and user feedback.
As atomic operations also require barriers for some operations the
existing barrier support has been moved into the atomics code.
Author: Andres Freund with contributions from Oskari Saarenmaa
Reviewed-By: Amit Kapila, Robert Haas, Heikki Linnakangas and Álvaro Herrera
Discussion: CA+TgmoYBW+ux5-8Ja=Mcyuy8=VXAnVRHp3Kess6Pn3DMXAPAEA@mail.gmail.com,
20131015123303.GH5300@awork2.anarazel.de,
20131028205522.GI20248@awork2.anarazel.de
11 years ago
|
|
|
[pgac_cv_gcc_sync_int32_tas="yes"],
|
|
|
|
[pgac_cv_gcc_sync_int32_tas="no"])])
|
|
|
|
if test x"$pgac_cv_gcc_sync_int32_tas" = x"yes"; then
|
|
|
|
AC_DEFINE(HAVE_GCC__SYNC_INT32_TAS, 1, [Define to 1 if you have __sync_lock_test_and_set(int *) and friends.])
|
|
|
|
fi])# PGAC_HAVE_GCC__SYNC_INT32_TAS
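
For context, the kind of test-and-set spinlock these builtins are typically
used for might look like the following minimal sketch (hypothetical my_*
names; assumes a gcc-compatible compiler and POSIX sched_yield()):

#include <sched.h>

static void
my_spin_acquire(volatile int *lock)
{
    /* __sync_lock_test_and_set returns the previous value and acts as an
     * acquire barrier; spin until we observe that the lock was free (0). */
    while (__sync_lock_test_and_set(lock, 1) != 0)
        sched_yield();          /* yield the CPU while waiting */
}

static void
my_spin_release(volatile int *lock)
{
    /* release barrier plus a store of 0 */
    __sync_lock_release(lock);
}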

# PGAC_HAVE_GCC__SYNC_INT32_CAS
# -----------------------------
# Check if the C compiler understands __sync_compare_and_swap() for 32bit
# types, and define HAVE_GCC__SYNC_INT32_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__SYNC_INT32_CAS],
[AC_CACHE_CHECK(for builtin __sync int32 atomic operations, pgac_cv_gcc_sync_int32_cas,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
  [int val = 0;
   __sync_val_compare_and_swap(&val, 0, 37);])],
  [pgac_cv_gcc_sync_int32_cas="yes"],
  [pgac_cv_gcc_sync_int32_cas="no"])])
if test x"$pgac_cv_gcc_sync_int32_cas" = x"yes"; then
  AC_DEFINE(HAVE_GCC__SYNC_INT32_CAS, 1, [Define to 1 if you have __sync_val_compare_and_swap(int *, int, int).])
fi])# PGAC_HAVE_GCC__SYNC_INT32_CAS
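
The semantics the test program above relies on can be seen in this small
standalone program (assumes gcc or clang; not part of the macro itself):

#include <stdio.h>

int
main(void)
{
    int val = 0;

    /* Full-barrier CAS: if val == 0, store 37; returns the value seen before. */
    int before = __sync_val_compare_and_swap(&val, 0, 37);
    printf("before=%d after=%d\n", before, val);    /* before=0 after=37 */

    /* This attempt fails: val is now 37, not 0, so nothing is stored. */
    before = __sync_val_compare_and_swap(&val, 0, 99);
    printf("before=%d after=%d\n", before, val);    /* before=37 after=37 */
    return 0;
}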

Use <stdint.h> and <inttypes.h> for c.h integers.
Redefine our exact width types with standard C99 types and macros,
including int64_t, INT64_MAX, INT64_C(), PRId64 etc. We were already
using <stdint.h> types in a few places.
One complication is that Windows' <inttypes.h> uses format strings like
"%I64d", "%I32", "%I" for PRI*64, PRI*32, PRI*PTR, instead of mapping to
other standardized format strings like "%lld" etc as seen on other known
systems. Teach our snprintf.c to understand them.
This removes a lot of configure clutter, and should also allow 64-bit
numbers and other standard types to be used in localized messages
without casting.
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/ME3P282MB3166F9D1F71F787929C0C7E7B6312%40ME3P282MB3166.AUSP282.PROD.OUTLOOK.COM
7 months ago

# PGAC_HAVE_GCC__SYNC_INT64_CAS
# -----------------------------
# Check if the C compiler understands __sync_compare_and_swap() for 64bit
# types, and define HAVE_GCC__SYNC_INT64_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__SYNC_INT64_CAS],
[AC_CACHE_CHECK(for builtin __sync int64 atomic operations, pgac_cv_gcc_sync_int64_cas,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <stdint.h>],
  [int64_t lock = 0;
   __sync_val_compare_and_swap(&lock, 0, (int64_t) 37);])],
  [pgac_cv_gcc_sync_int64_cas="yes"],
  [pgac_cv_gcc_sync_int64_cas="no"])])
if test x"$pgac_cv_gcc_sync_int64_cas" = x"yes"; then
  AC_DEFINE(HAVE_GCC__SYNC_INT64_CAS, 1, [Define to 1 if you have __sync_val_compare_and_swap(int64_t *, int64_t, int64_t).])
fi])# PGAC_HAVE_GCC__SYNC_INT64_CAS
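
As a short aside to the <stdint.h>/<inttypes.h> commit message above, the
PRI*64 macros it mentions are plain C99 and expand to whatever conversion the
platform needs ("lld" on most systems, "I64d" with Windows' <inttypes.h>):

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    int64_t big = INT64_C(9007199254740993);

    /* PRId64 supplies the platform's conversion for a 64-bit integer. */
    printf("big = %" PRId64 "\n", big);
    return 0;
}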

# PGAC_HAVE_GCC__ATOMIC_INT32_CAS
# -------------------------------
# Check if the C compiler understands __atomic_compare_exchange_n() for 32bit
# types, and define HAVE_GCC__ATOMIC_INT32_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__ATOMIC_INT32_CAS],
[AC_CACHE_CHECK(for builtin __atomic int32 atomic operations, pgac_cv_gcc_atomic_int32_cas,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
  [int val = 0;
   int expect = 0;
   __atomic_compare_exchange_n(&val, &expect, 37, 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);])],
  [pgac_cv_gcc_atomic_int32_cas="yes"],
  [pgac_cv_gcc_atomic_int32_cas="no"])])
if test x"$pgac_cv_gcc_atomic_int32_cas" = x"yes"; then
  AC_DEFINE(HAVE_GCC__ATOMIC_INT32_CAS, 1, [Define to 1 if you have __atomic_compare_exchange_n(int *, int *, int).])
fi])# PGAC_HAVE_GCC__ATOMIC_INT32_CAS
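
Unlike the __sync form, __atomic_compare_exchange_n reports success as a bool
and rewrites the expected value on failure, so a retry loop needs no separate
reload. A small standalone sketch of those semantics (assumes gcc or clang):

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
    int val = 0;
    int expect = 5;             /* deliberately wrong guess */

    /* Strong CAS (weak = false): on failure, 'expect' is overwritten with
     * the value actually found in 'val'. */
    bool ok = __atomic_compare_exchange_n(&val, &expect, 37, false,
                                          __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
    printf("ok=%d val=%d expect=%d\n", ok, val, expect);   /* ok=0 val=0 expect=0 */

    /* Retry with the corrected expectation; this one succeeds. */
    ok = __atomic_compare_exchange_n(&val, &expect, 37, false,
                                     __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
    printf("ok=%d val=%d expect=%d\n", ok, val, expect);   /* ok=1 val=37 expect=0 */
    return 0;
}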

# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# -------------------------------
# Check if the C compiler understands __atomic_compare_exchange_n() for 64bit
# types, and define HAVE_GCC__ATOMIC_INT64_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__ATOMIC_INT64_CAS],
[AC_CACHE_CHECK(for builtin __atomic int64 atomic operations, pgac_cv_gcc_atomic_int64_cas,
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <stdint.h>],
  [int64_t val = 0;
   int64_t expect = 0;
   __atomic_compare_exchange_n(&val, &expect, 37, 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);])],
  [pgac_cv_gcc_atomic_int64_cas="yes"],
  [pgac_cv_gcc_atomic_int64_cas="no"])])
if test x"$pgac_cv_gcc_atomic_int64_cas" = x"yes"; then
  AC_DEFINE(HAVE_GCC__ATOMIC_INT64_CAS, 1, [Define to 1 if you have __atomic_compare_exchange_n(int64 *, int64 *, int64).])
fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
Use Intel SSE 4.2 CRC instructions where available.
Modern x86 and x86-64 processors with SSE 4.2 support have special
instructions, crc32b and crc32q, for calculating CRC-32C. They greatly
speed up CRC calculation.
Whether the instructions can be used or not depends on the compiler and the
target architecture. If generation of SSE 4.2 instructions is allowed for
the target (-msse4.2 flag on gcc and clang), use them. If they are not
allowed by default, but the compiler supports the -msse4.2 flag to enable
them, compile just the CRC-32C function with -msse4.2 flag, and check at
runtime whether the processor we're running on supports it. If it doesn't,
fall back to the slicing-by-8 algorithm. (With the common defaults on
current operating systems, the runtime-check variant is what you get in
practice.)
Abhijit Menon-Sen, heavily modified by me, reviewed by Andres Freund.
10 years ago

Use __attribute__((target(...))) for SSE4.2 CRC-32C support.
Presently, we check for compiler support for the required
intrinsics both with and without the -msse4.2 compiler flag, and
then depending on the results of those checks, we pick which files
to compile with which flags. This is tedious and complicated, and
it results in unsustainable coding patterns such as separate files
for each portion of code that may need to be built with different
compiler flags.
This commit makes use of the newly-added support for
__attribute__((target(...))) in the SSE4.2 CRC-32C code. This
simplifies both the configure-time checks and the build scripts,
and it allows us to place the functions that use the intrinsics in
files that we otherwise do not want to build with special CPU
instructions (although this commit refrains from doing so). This
is also preparatory work for a proposed follow-up commit that will
further optimize the CRC-32C code with AVX-512 instructions.
While at it, this commit modifies meson's checks for SSE4.2 CRC
support to be the same as autoconf's. meson was choosing whether
to use a runtime check based purely on whether -msse4.2 is
required, while autoconf has long checked for the __SSE4_2__
preprocessor symbol to decide. meson's previous approach seems to
work just fine, but this change avoids needing to build multiple
test programs and to keep track of whether to actually use
pg_attribute_target().
Ideally we'd use __attribute__((target(...))) for ARMv8 CRC
support, too, but there's little point in doing so because until
clang 16, using the ARM intrinsics still requires special compiler
flags. Perhaps we can re-evaluate this decision after some time
has passed.
Author: Raghuveer Devulapalli
Discussion: https://postgr.es/m/PH8PR11MB8286BE735A463468415D46B5FB5C2%40PH8PR11MB8286.namprd11.prod.outlook.com
7 months ago

Be more paranoid in configure's checks for CRC and POPCNT intrinsics.
In these tests, we need to verify not only that the compiler has heard
of these intrinsics, but that lower-level tools cope with them too.
(For example, the assembler must also know the instructions, and on
some platforms there might be library support involved.) The hazard
is that the compiler might optimize away the calls altogether,
allowing the configure check to succeed only to have the build fail
later if lower-level support is missing. The existing code tried to
prevent that by ensuring that the result of the intrinsic is used
for something, but that's really insufficient because we were feeding
constant input to it. So the compiler would be perfectly entitled to
optimize away the calls anyway. Fix by making the inputs into global
variables. (Hypothetically, LTO optimization could still remove the
code --- but that's well past where we'd be likely to hit trouble.)
It is not known that any current compiler would actually optimize
away these calls, and even if that happened it would be unlikely
that any problem would manifest. Our concern for this stems from
largely-bygone days when it was common to install gcc on platforms
with some other native compiler, so that a compiler-vs-library
support discrepancy was more probable. Still, there's little
point in defending against such cases in a way that is visibly
incomplete.
I'm content to fix this in master for now; we can back-patch if
any indication appears that it's a live problem for someone.
Discussion: https://postgr.es/m/3368102.1741993462@sss.pgh.pa.us
3 months ago

# PGAC_SSE42_CRC32_INTRINSICS
# ---------------------------
# Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
# the other ones are, on x86-64 platforms)
#
# If the intrinsics are supported, sets pgac_sse42_crc32_intrinsics.
#
# To detect the case where the compiler knows the function but library support
# is missing, we must link not just compile, and store the results in global
# variables so the compiler doesn't optimize away the call.
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32], [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
unsigned int crc;
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("sse4.2")))
#endif
static int crc32_sse42_test(void)
{
  crc = _mm_crc32_u8(crc, 0);
  crc = _mm_crc32_u32(crc, 0);
  /* return computed value, to prevent the above being optimized away */
  return crc == 0;
}],
  [return crc32_sse42_test();])],
  [Ac_cachevar=yes],
  [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
  pgac_sse42_crc32_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
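
Putting the pieces together, the runtime-check pattern described in the
commit messages above might look roughly like this sketch (not PostgreSQL's
actual CRC code; assumes gcc/clang with __builtin_cpu_supports and the
target attribute):

#include <inttypes.h>
#include <nmmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Compiled with SSE 4.2 enabled regardless of the default target flags;
 * must only be called after the runtime check below succeeds. */
__attribute__((target("sse4.2")))
static uint32_t
crc32c_sse42(uint32_t crc, const unsigned char *p, size_t len)
{
    while (len--)
        crc = _mm_crc32_u8(crc, *p++);
    return crc;
}

int
main(void)
{
    const unsigned char data[] = "hello";

    if (__builtin_cpu_supports("sse4.2"))
        /* customary initial value and final inversion for CRC-32C */
        printf("crc = %08" PRIx32 "\n", crc32c_sse42(~0u, data, 5) ^ ~0u);
    else
        printf("no SSE 4.2; a software CRC would be used instead\n");
    return 0;
}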
Use ARMv8 CRC instructions where available.
ARMv8 introduced special CPU instructions for calculating CRC-32C. Use
them, when available, for speed.
Like with the similar Intel CRC instructions, several factors affect
whether the instructions can be used. The compiler intrinsics for them must
be supported by the compiler, and the instructions must be supported by the
target architecture. If the compilation target architecture does not
support the instructions, but adding "-march=armv8-a+crc" makes them
available, then we compile the code with a runtime check to determine if
the host we're running on supports them or not.
For the runtime check, use glibc getauxval() function. Unfortunately,
that's not very portable, but I couldn't find any more portable way to do
it. If getauxval() is not available, the CRC instructions will still be
used if the target architecture supports them without any additional
compiler flags, but the runtime check will not be available.
Original patch by Yuqi Gu, heavily modified by me. Reviewed by Andres
Freund, Thomas Munro.
Discussion: https://www.postgresql.org/message-id/HE1PR0801MB1323D171938EABC04FFE7FA9E3110%40HE1PR0801MB1323.eurprd08.prod.outlook.com
7 years ago
|
|
|
|
Compute CRC32C using AVX-512 instructions where available
The previous implementation of CRC32C on x86 relied on the native
CRC32 instruction from the SSE 4.2 extension, which operates on
up to 8 bytes at a time. We can get a substantial speedup by using
carryless multiplication on SIMD registers, processing 64 bytes per
loop iteration. Shorter inputs fall back to ordinary CRC instructions.
On Intel Tiger Lake hardware (2020), CRC is now 50% faster for inputs
between 64 and 112 bytes, and 3x faster for 256 bytes.
The VPCLMULQDQ instruction on 512-bit registers has been available
on Intel hardware since 2019 and AMD since 2022. There is an older
variant for 128-bit registers, but at least on Zen 2 it performs worse
than normal CRC instructions for short inputs.
We must now do a runtime check, even for builds that target SSE
4.2. This doesn't matter in practice for WAL (arguably the most
critical case), because since commit e2809e3a1 the final computation
with the 20-byte WAL header is inlined and unrolled when targeting
that extension. Compared with two direct function calls, testing
showed equal or slightly faster performance in performing an indirect
function call on several dozen bytes followed by inlined instructions
on constant input of 20 bytes.
The MIT-licensed implementation was generated with the "generate"
program from
https://github.com/corsix/fast-crc32/
Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009
Co-authored-by: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Co-authored-by: Paul Amonson <paul.d.amonson@intel.com>
Reviewed-by: Nathan Bossart <nathandbossart@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de> (earlier version)
Reviewed-by: Matthew Sterrett <matthewsterrett2@gmail.com> (earlier version)
Tested-by: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Tested-by: David Rowley <dgrowleyml@gmail.com> (earlier version)
Discussion: https://postgr.es/m/BL1PR11MB530401FA7E9B1CA432CF9DC3DC192@BL1PR11MB5304.namprd11.prod.outlook.com
Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
3 months ago
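
The core primitive here is carry-less (polynomial) multiplication: CRC-32C is
arithmetic over GF(2), where multiplication works like integer multiplication
but with XOR in place of addition, which is what lets 64-byte chunks be
"folded" together. A scalar model in plain C (illustrative only; VPCLMULQDQ
performs this operation on vector lanes):

    #include <stdint.h>

    /* Carry-less multiplication of two 32-bit polynomials over GF(2). */
    static uint64_t
    clmul32(uint32_t a, uint32_t b)
    {
        uint64_t r = 0;

        for (int i = 0; i < 32; i++)
            if ((b >> i) & 1)
                r ^= (uint64_t) a << i;
        return r;
    }
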

# PGAC_AVX512_PCLMUL_INTRINSICS
# -----------------------------
# Check if the compiler supports AVX-512 carryless multiplication
# and three-way exclusive-or instructions used for computing CRC.
# AVX-512F is assumed to be supported if the above are.
#
# If the intrinsics are supported, sets pgac_avx512_pclmul_intrinsics.
AC_DEFUN([PGAC_AVX512_PCLMUL_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_pclmul_intrinsics])])dnl
AC_CACHE_CHECK([for _mm512_clmulepi64_epi128], [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
__m512i x;
__m512i y;

#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("vpclmulqdq,avx512vl")))
#endif
static int avx512_pclmul_test(void)
{
  __m128i z;

  y = _mm512_clmulepi64_epi128(x, y, 0);
  z = _mm_ternarylogic_epi64(
      _mm512_castsi512_si128(y),
      _mm512_extracti32x4_epi32(y, 1),
      _mm512_extracti32x4_epi32(y, 2),
      0x96);
  return _mm_crc32_u64(0, _mm_extract_epi64(z, 0));
}],
  [return avx512_pclmul_test();])],
  [Ac_cachevar=yes],
  [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
  pgac_avx512_pclmul_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_AVX512_PCLMUL_INTRINSICS
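
The immediate 0x96 in _mm_ternarylogic_epi64 is the eight-entry truth table of
three-way XOR: for operand bits a, b, c, the result bit is bit
((a << 2) | (b << 1) | c) of 0x96. A few lines of plain C (no intrinsics)
verify that reading:

    #include <assert.h>

    int
    main(void)
    {
        for (unsigned a = 0; a <= 1; a++)
            for (unsigned b = 0; b <= 1; b++)
                for (unsigned c = 0; c <= 1; c++)
                {
                    unsigned idx = (a << 2) | (b << 1) | c;

                    /* 0x96 = 0b10010110: bit idx set iff idx has odd parity */
                    assert(((0x96 >> idx) & 1) == (a ^ b ^ c));
                }
        return 0;
    }
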
Use ARMv8 CRC instructions where available.
ARMv8 introduced special CPU instructions for calculating CRC-32C. Use
them, when available, for speed.
Like with the similar Intel CRC instructions, several factors affect
whether the instructions can be used. The compiler intrinsics for them must
be supported by the compiler, and the instructions must be supported by the
target architecture. If the compilation target architecture does not
support the instructions, but adding "-march=armv8-a+crc" makes them
available, then we compile the code with a runtime check to determine if
the host we're running on supports them or not.
For the runtime check, use the glibc getauxval() function. Unfortunately,
that's not very portable, but I couldn't find any more portable way to do
it. If getauxval() is not available, the CRC instructions will still be
used if the target architecture supports them without any additional
compiler flags, but the runtime check will not be available.
Original patch by Yuqi Gu, heavily modified by me. Reviewed by Andres
Freund, Thomas Munro.
Discussion: https://www.postgresql.org/message-id/HE1PR0801MB1323D171938EABC04FFE7FA9E3110%40HE1PR0801MB1323.eurprd08.prod.outlook.com
7 years ago
# PGAC_ARMV8_CRC32C_INTRINSICS
# ----------------------------
# Check if the compiler supports the CRC32C instructions using the __crc32cb,
# __crc32ch, __crc32cw, and __crc32cd intrinsic functions. These instructions
# were first introduced in ARMv8 in the optional CRC Extension, and became
# mandatory in ARMv8.1.
#
# An optional compiler flag can be passed as argument (e.g.
# -march=armv8-a+crc). If the intrinsics are supported, sets
# pgac_armv8_crc32c_intrinsics, and CFLAGS_CRC.
AC_DEFUN([PGAC_ARMV8_CRC32C_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_armv8_crc32c_intrinsics_$1])])dnl
AC_CACHE_CHECK([for __crc32cb, __crc32ch, __crc32cw, and __crc32cd with CFLAGS=$1], [Ac_cachevar],
[pgac_save_CFLAGS=$CFLAGS
CFLAGS="$pgac_save_CFLAGS $1"

Be more paranoid in configure's checks for CRC and POPCNT intrinsics.
In these tests, we need to verify not only that the compiler has heard
of these intrinsics, but that lower-level tools cope with them too.
(For example, the assembler must also know the instructions, and on
some platforms there might be library support involved.) The hazard
is that the compiler might optimize away the calls altogether,
allowing the configure check to succeed only to have the build fail
later if lower-level support is missing. The existing code tried to
prevent that by ensuring that the result of the intrinsic is used
for something, but that's really insufficient because we were feeding
constant input to it. So the compiler would be perfectly entitled to
optimize away the calls anyway. Fix by making the inputs into global
variables. (Hypothetically, LTO optimization could still remove the
code --- but that's well past where we'd be likely to hit trouble.)
It is not known that any current compiler would actually optimize
away these calls, and even if that happened it would be unlikely
that any problem would manifest. Our concern for this stems from
largely-bygone days when it was common to install gcc on platforms
with some other native compiler, so that a compiler-vs-library
support discrepancy was more probable. Still, there's little
point in defending against such cases in a way that is visibly
incomplete.
I'm content to fix this in master for now; we can back-patch if
any indication appears that it's a live problem for someone.
Discussion: https://postgr.es/m/3368102.1741993462@sss.pgh.pa.us
3 months ago

AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h>
unsigned int crc;],
  [crc = __crc32cb(crc, 0);
   crc = __crc32ch(crc, 0);
   crc = __crc32cw(crc, 0);
   crc = __crc32cd(crc, 0);
   /* return computed value, to prevent the above being optimized away */
   return crc == 0;])],
  [Ac_cachevar=yes],
  [Ac_cachevar=no])
CFLAGS="$pgac_save_CFLAGS"])
if test x"$Ac_cachevar" = x"yes"; then
  CFLAGS_CRC="$1"
  pgac_armv8_crc32c_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_ARMV8_CRC32C_INTRINSICS
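
The getauxval() check described in the commit message above reads the ELF
auxiliary vector that the kernel hands to every process. A minimal
Linux/glibc sketch for AArch64 (AT_HWCAP and HWCAP_CRC32 are Linux-specific
names; other systems need different mechanisms):

    #include <sys/auxv.h>       /* getauxval(), AT_HWCAP */
    #include <asm/hwcap.h>      /* HWCAP_CRC32 on AArch64 Linux */

    /* Returns 1 if the CPU we are running on has the ARMv8 CRC extension. */
    static int
    armv8_crc32c_available(void)
    {
        return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
    }
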


# PGAC_LOONGARCH_CRC32C_INTRINSICS
# --------------------------------
# Check if the compiler supports the LoongArch CRCC instructions, using
# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
# intrinsic functions.
#
# We test for the 8-byte variant since platforms capable of running
# Postgres are 64-bit only (as of PG17), and we know CRC instructions
# are available there without a runtime check.
#
# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
AC_CACHE_CHECK(
  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
  [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([unsigned int crc;],
  [crc = __builtin_loongarch_crcc_w_b_w(0, crc);
   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
   /* return computed value, to prevent the above being optimized away */
   return crc == 0;])],
  [Ac_cachevar=yes],
  [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
  pgac_loongarch_crc32c_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_LOONGARCH_CRC32C_INTRINSICS

Optimize pg_popcount() with AVX-512 instructions.
Presently, pg_popcount() processes data in 32-bit or 64-bit chunks
when possible. Newer hardware that supports AVX-512 instructions
can use 512-bit chunks, which provides a nice speedup, especially
for larger buffers. This commit introduces the infrastructure
required to detect compiler and CPU support for the required
AVX-512 intrinsic functions, and it adds a new pg_popcount()
implementation that uses these functions. If CPU support for this
optimized implementation is detected at runtime, a function pointer
is updated so that it is used by subsequent calls to pg_popcount().
Most of the existing in-tree calls to pg_popcount() should benefit
from these instructions, and calls with smaller buffers should at
least not regress compared to v16. The new infrastructure
introduced by this commit can also be used to optimize
visibilitymap_count(), but that is left for a follow-up commit.
Co-authored-by: Paul Amonson, Ants Aasma
Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley
Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
1 year ago
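
The function-pointer scheme the message describes can be sketched in a few
lines of C (the names here are hypothetical, not the src/port
implementation): the pointer initially aims at a chooser that probes the CPU
once, installs the best implementation, and forwards the first call:

    #include <stddef.h>
    #include <stdint.h>

    static uint64_t popcount_portable(const char *buf, size_t bytes);
    static uint64_t popcount_choose(const char *buf, size_t bytes);

    /* All callers go through this pointer; the first call resolves it. */
    static uint64_t (*pg_popcount_impl)(const char *, size_t) = popcount_choose;

    static uint64_t
    popcount_portable(const char *buf, size_t bytes)
    {
        uint64_t cnt = 0;

        while (bytes-- > 0)
            cnt += __builtin_popcount((unsigned char) *buf++);
        return cnt;
    }

    static uint64_t
    popcount_choose(const char *buf, size_t bytes)
    {
        /* probe CPUID/XGETBV here; fall back if AVX-512 is unavailable */
        pg_popcount_impl = popcount_portable;
        return pg_popcount_impl(buf, bytes);
    }
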

# PGAC_XSAVE_INTRINSICS
# ---------------------
# Check if the compiler supports the XSAVE instructions using the _xgetbv
# intrinsic function.
#
Use __attribute__((target(...))) for AVX-512 support.
Presently, we check for compiler support for the required
intrinsics both with and without extra compiler flags (e.g.,
-mxsave), and then depending on the results of those checks, we
pick which files to compile with which flags. This is tedious and
complicated, and it results in unsustainable coding patterns such
as separate files for each portion of code that may need to be built
with different compiler flags.
This commit introduces support for __attribute__((target(...))) and
uses it for the AVX-512 code. This simplifies both the
configure-time checks and the build scripts, and it allows us to
place the functions that use the intrinsics in files that we
otherwise do not want to build with special CPU instructions. We
are careful to avoid using __attribute__((target(...))) on
compilers that do not understand it, but we still perform the
configure-time checks in case the compiler allows using the
intrinsics without it (e.g., MSVC).
A similar change could likely be made for some of the CRC-32C code,
but that is left as a future exercise.
Suggested-by: Andres Freund
Reviewed-by: Raghuveer Devulapalli, Andres Freund
Discussion: https://postgr.es/m/20240731205254.vfpap7uxwmebqeaf%40awork3.anarazel.de
8 months ago
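
The guard this commit relies on (wrapped as pg_attribute_target() in the
tree) can be summarized in a short sketch; the exact in-tree definition may
differ, but the __has_attribute test is what keeps compilers lacking the
attribute, such as MSVC, compiling the same source:

    /* Expand to the target attribute only where the compiler understands it. */
    #if defined(__has_attribute) && __has_attribute (target)
    #define pg_attribute_target(...) __attribute__((target(__VA_ARGS__)))
    #else
    #define pg_attribute_target(...)
    #endif

    pg_attribute_target("xsave")
    static int
    probe_xsave(void)
    {
        return 0;   /* a real probe would call _xgetbv(), as checked below */
    }
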
# If the intrinsics are supported, sets pgac_xsave_intrinsics.
AC_DEFUN([PGAC_XSAVE_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics])])dnl
AC_CACHE_CHECK([for _xgetbv], [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("xsave")))
#endif
static int xsave_test(void)
{
  return _xgetbv(0) & 0xe0;
}],
  [return xsave_test();])],
  [Ac_cachevar=yes],
  [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
  pgac_xsave_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_XSAVE_INTRINSICS
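
Why _xgetbv(0) & 0xe0: XGETBV register 0 (XCR0) reports which register state
the OS saves and restores across context switches; bits 5-7 cover the AVX-512
opmask and ZMM state, so 0xe0 masks all three. A fuller runtime probe, as a
sketch assuming GCC/clang and <cpuid.h>, first confirms XGETBV itself is
enabled:

    #include <cpuid.h>          /* __get_cpuid(), GCC/clang */
    #include <immintrin.h>      /* _xgetbv(); needs -mxsave or a target attr */

    __attribute__((target("xsave")))
    static int
    os_saves_avx512_state(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 1, ECX bit 27 = OSXSAVE: XGETBV may be executed */
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) || !(ecx & (1u << 27)))
            return 0;
        /* XCR0 bits 5-7: opmask (k0-k7), ZMM0-15 high halves, ZMM16-31 */
        return (_xgetbv(0) & 0xe0) == 0xe0;
    }
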


# PGAC_AVX512_POPCNT_INTRINSICS
# -----------------------------
# Check if the compiler supports the AVX-512 popcount instructions using the
# _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64,
# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
#
# If the intrinsics are supported, sets pgac_avx512_popcnt_intrinsics.
AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics])])dnl
AC_CACHE_CHECK([for _mm512_popcnt_epi64], [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <immintrin.h>

Use <stdint.h> and <inttypes.h> for c.h integers.
Redefine our exact width types with standard C99 types and macros,
including int64_t, INT64_MAX, INT64_C(), PRId64 etc. We were already
using <stdint.h> types in a few places.
One complication is that Windows' <inttypes.h> uses format strings like
"%I64d", "%I32", "%I" for PRI*64, PRI*32, PTR*PTR, instead of mapping to
other standardized format strings like "%lld" etc as seen on other known
systems. Teach our snprintf.c to understand them.
This removes a lot of configure clutter, and should also allow 64-bit
numbers and other standard types to be used in localized messages
without casting.
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/ME3P282MB3166F9D1F71F787929C0C7E7B6312%40ME3P282MB3166.AUSP282.PROD.OUTLOOK.COM
7 months ago

#include <stdint.h>

char buf[sizeof(__m512i)];

#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("avx512vpopcntdq,avx512bw")))
#endif
static int popcount_test(void)
{
  int64_t popcnt = 0;
  __m512i accum = _mm512_setzero_si512();
  __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
  __m512i cnt = _mm512_popcnt_epi64(val);

  accum = _mm512_add_epi64(accum, cnt);
  popcnt = _mm512_reduce_add_epi64(accum);
  return (int) popcnt;
}]],
  [return popcount_test();])],
  [Ac_cachevar=yes],
  [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
  pgac_avx512_popcnt_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_AVX512_POPCNT_INTRINSICS
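
For reference on what the test program computes: _mm512_maskz_loadu_epi8
loads only the bytes whose mask bit is set (zeroing the others),
_mm512_popcnt_epi64 counts bits per 64-bit lane, and the reduce-add sums the
lanes. A scalar model in plain C (illustrative only; __builtin_popcount is a
GCC/clang builtin):

    #include <stdint.h>

    /* Count the set bits of buf[0..63], keeping byte i only when bit i of
     * mask is set -- the scalar equivalent of the masked load, per-lane
     * popcount, and reduction in the test program above. */
    static int
    popcount_model(const unsigned char buf[64], uint64_t mask)
    {
        int popcnt = 0;

        for (int i = 0; i < 64; i++)
            if ((mask >> i) & 1)
                popcnt += __builtin_popcount(buf[i]);
        return popcnt;
    }
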


# PGAC_SVE_POPCNT_INTRINSICS
# --------------------------
# Check if the compiler supports the SVE popcount instructions using the
# svptrue_b64, svdup_u64, svcntb, svld1_u64, svld1_u8, svadd_u64_x,
# svcnt_u64_x, svcnt_u8_x, svaddv_u64, svaddv_u8, svwhilelt_b8_s32,
# svand_n_u64_x, and svand_n_u8_x intrinsic functions.
#
# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>

char buf[128];

#if defined(__has_attribute) && __has_attribute (target)
__attribute__((target("arch=armv8-a+sve")))
#endif
static int popcount_test(void)
{
  svbool_t pred = svptrue_b64();
  svuint8_t vec8;
  svuint64_t accum1 = svdup_u64(0),
             accum2 = svdup_u64(0),
             vec64;
  char *p = buf;
  uint64_t popcnt,
           mask = 0x5555555555555555;

  vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
  accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
  p += svcntb();

  vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
  accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
  p += svcntb();

  popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));

  pred = svwhilelt_b8_s32(0, sizeof(buf));
  vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
  return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
}]],
  [return popcount_test();])],
  [Ac_cachevar=yes],
  [Ac_cachevar=no])])
if test x"$Ac_cachevar" = x"yes"; then
  pgac_sve_popcnt_intrinsics=yes
fi
undefine([Ac_cachevar])dnl
])# PGAC_SVE_POPCNT_INTRINSICS
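
The interesting part of this test is the tail handling: svwhilelt_b8_s32(i, n)
builds a predicate whose lane j is active only while i + j < n, so the final
partial chunk needs no scalar cleanup loop. A plain-C model of such a
predicated loop (illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    /* Process buf in vlen-byte chunks; the 'active' computation models the
     * partial predicate that svwhilelt_b8_s32 produces for the last chunk. */
    static uint64_t
    popcount_predicated(const unsigned char *buf, size_t n, size_t vlen)
    {
        uint64_t popcnt = 0;

        for (size_t i = 0; i < n; i += vlen)
        {
            size_t active = (n - i < vlen) ? n - i : vlen;

            for (size_t j = 0; j < active; j++)
                popcnt += __builtin_popcount(buf[i + j]);
        }
        return popcnt;
    }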