Add a basic atomic ops API abstracting away platform/architecture details.

Several upcoming performance/scalability improvements require atomic
operations. This new API avoids the need to splatter compiler- and
architecture-dependent code over all the locations employing atomic
ops.

For several of the potential usages it would be problematic to maintain
both an atomics-based implementation and one using spinlocks or
similar; in all likelihood one of the two would not get tested
regularly under concurrency. To avoid that scenario the new API
provides an automatic fallback of atomic operations to spinlocks. All
properties of atomic operations are maintained. This fallback is
obviously not as fast as real atomic ops, but it is not bad either.
For one of the future users the atomics-on-top-of-spinlocks
implementation was actually slightly faster than the old purely
spinlock-based implementation. That's important because it reduces the
fear of regressing older platforms when improving the scalability for
new ones.

The API, loosely modeled on the C11 atomics support, currently
provides 'atomic flags' and 32-bit unsigned integers. If the platform
supports atomic 64-bit unsigned integers efficiently, those are
provided as well.
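
For illustration, a minimal usage sketch (the enclosing function and
the counter variable are hypothetical; the pg_atomic_* calls are the
ones declared in src/include/port/atomics.h below):

    #include "port/atomics.h"   /* in a backend .c file, postgres.h comes first */

    /* hypothetical counter; would normally live in shared memory */
    static pg_atomic_uint32 counter;

    static void
    counter_example(void)
    {
        uint32  old;

        pg_atomic_init_u32(&counter, 0);            /* before any concurrent use */
        old = pg_atomic_fetch_add_u32(&counter, 1); /* returns the pre-add value */

        /* strong compare-and-exchange: retry until we swap in 0 */
        old = pg_atomic_read_u32(&counter);
        while (!pg_atomic_compare_exchange_u32(&counter, &old, 0))
        {
            /* on failure 'old' is updated to the current value; just retry */
        }
    }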

To implement atomics support for a platform/architecture/compiler
combination, only 32-bit compare-and-exchange needs to be
implemented. If available and more efficient, native support for
flags, 32-bit atomic addition, and the corresponding 64-bit operations
may also be provided. Additional useful atomic operations are
implemented generically on top of these.
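
For illustration, a sketch of that generic layering (the helper name
is hypothetical; the real fallbacks live in
src/include/port/atomics/generic.h, which is not included in this
excerpt): fetch-add emulated with a compare-and-exchange loop.

    /* emulate fetch_add using only the required compare/exchange primitive */
    static inline uint32
    fetch_add_via_cas_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
    {
        uint32  old;

        old = pg_atomic_read_u32_impl(ptr);
        /* a strong CAS refreshes 'old' with the current value on failure */
        while (!pg_atomic_compare_exchange_u32_impl(ptr, &old, old + add_))
        {
            /* lost the race; 'old' was updated, try again */
        }
        return old;
    }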

Implementations for various versions of gcc, msvc and Sun Studio have
been tested. Additional, so far untested, stub implementations for
* Intel icc
* HP-UX acc
* IBM xlc
are included; these will likely require fixes based on buildfarm and
user feedback.

As some atomic operations also require barriers, the existing barrier
support has been moved into the atomics code.

Author: Andres Freund with contributions from Oskari Saarenmaa
Reviewed-By: Amit Kapila, Robert Haas, Heikki Linnakangas and Álvaro Herrera
Discussion: CA+TgmoYBW+ux5-8Ja=Mcyuy8=VXAnVRHp3Kess6Pn3DMXAPAEA@mail.gmail.com,
    20131015123303.GH5300@awork2.anarazel.de,
    20131028205522.GI20248@awork2.anarazel.de
Andres Freund 2014-09-25 23:49:05 +02:00
parent 9111d46351
commit b64d92f1a5
31 changed files with 2816 additions and 208 deletions


@ -300,3 +300,101 @@ if test x"$Ac_cachevar" = x"yes"; then
fi
undefine([Ac_cachevar])dnl
])# PGAC_PROG_CC_LDFLAGS_OPT
# PGAC_HAVE_GCC__SYNC_CHAR_TAS
# -------------------------
# Check if the C compiler understands __sync_lock_test_and_set(char),
# and define HAVE_GCC__SYNC_CHAR_TAS
#
# NB: There are platforms where test_and_set is available but compare_and_swap
# is not, so test this separately.
# NB: Some platforms only do 32bit tas, others only do 8bit tas. Test both.
AC_DEFUN([PGAC_HAVE_GCC__SYNC_CHAR_TAS],
[AC_CACHE_CHECK(for builtin __sync char locking functions, pgac_cv_gcc_sync_char_tas,
[AC_TRY_LINK([],
[char lock = 0;
__sync_lock_test_and_set(&lock, 1);
__sync_lock_release(&lock);],
[pgac_cv_gcc_sync_char_tas="yes"],
[pgac_cv_gcc_sync_char_tas="no"])])
if test x"$pgac_cv_gcc_sync_char_tas" = x"yes"; then
AC_DEFINE(HAVE_GCC__SYNC_CHAR_TAS, 1, [Define to 1 if you have __sync_lock_test_and_set(char *) and friends.])
fi])# PGAC_HAVE_GCC__SYNC_CHAR_TAS
# PGAC_HAVE_GCC__SYNC_INT32_TAS
# -------------------------
# Check if the C compiler understands __sync_lock_test_and_set(),
# and define HAVE_GCC__SYNC_INT32_TAS
AC_DEFUN([PGAC_HAVE_GCC__SYNC_INT32_TAS],
[AC_CACHE_CHECK(for builtin __sync int32 locking functions, pgac_cv_gcc_sync_int32_tas,
[AC_TRY_LINK([],
[int lock = 0;
__sync_lock_test_and_set(&lock, 1);
__sync_lock_release(&lock);],
[pgac_cv_gcc_sync_int32_tas="yes"],
[pgac_cv_gcc_sync_int32_tas="no"])])
if test x"$pgac_cv_gcc_sync_int32_tas" = x"yes"; then
AC_DEFINE(HAVE_GCC__SYNC_INT32_TAS, 1, [Define to 1 if you have __sync_lock_test_and_set(int *) and friends.])
fi])# PGAC_HAVE_GCC__SYNC_INT32_TAS
# PGAC_HAVE_GCC__SYNC_INT32_CAS
# -------------------------
# Check if the C compiler understands __sync_compare_and_swap() for 32bit
# types, and define HAVE_GCC__SYNC_INT32_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__SYNC_INT32_CAS],
[AC_CACHE_CHECK(for builtin __sync int32 atomic operations, pgac_cv_gcc_sync_int32_cas,
[AC_TRY_LINK([],
[int val = 0;
__sync_val_compare_and_swap(&val, 0, 37);],
[pgac_cv_gcc_sync_int32_cas="yes"],
[pgac_cv_gcc_sync_int32_cas="no"])])
if test x"$pgac_cv_gcc_sync_int32_cas" = x"yes"; then
AC_DEFINE(HAVE_GCC__SYNC_INT32_CAS, 1, [Define to 1 if you have __sync_compare_and_swap(int *, int, int).])
fi])# PGAC_HAVE_GCC__SYNC_INT32_CAS
# PGAC_HAVE_GCC__SYNC_INT64_CAS
# -------------------------
# Check if the C compiler understands __sync_compare_and_swap() for 64bit
# types, and define HAVE_GCC__SYNC_INT64_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__SYNC_INT64_CAS],
[AC_CACHE_CHECK(for builtin __sync int64 atomic operations, pgac_cv_gcc_sync_int64_cas,
[AC_TRY_LINK([],
[PG_INT64_TYPE lock = 0;
__sync_val_compare_and_swap(&lock, 0, (PG_INT64_TYPE) 37);],
[pgac_cv_gcc_sync_int64_cas="yes"],
[pgac_cv_gcc_sync_int64_cas="no"])])
if test x"$pgac_cv_gcc_sync_int64_cas" = x"yes"; then
AC_DEFINE(HAVE_GCC__SYNC_INT64_CAS, 1, [Define to 1 if you have __sync_compare_and_swap(int64 *, int64, int64).])
fi])# PGAC_HAVE_GCC__SYNC_INT64_CAS
# PGAC_HAVE_GCC__ATOMIC_INT32_CAS
# -------------------------
# Check if the C compiler understands __atomic_compare_exchange_n() for 32bit
# types, and define HAVE_GCC__ATOMIC_INT32_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__ATOMIC_INT32_CAS],
[AC_CACHE_CHECK(for builtin __atomic int32 atomic operations, pgac_cv_gcc_atomic_int32_cas,
[AC_TRY_LINK([],
[int val = 0;
int expect = 0;
__atomic_compare_exchange_n(&val, &expect, 37, 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);],
[pgac_cv_gcc_atomic_int32_cas="yes"],
[pgac_cv_gcc_atomic_int32_cas="no"])])
if test x"$pgac_cv_gcc_atomic_int32_cas" = x"yes"; then
AC_DEFINE(HAVE_GCC__ATOMIC_INT32_CAS, 1, [Define to 1 if you have __atomic_compare_exchange_n(int *, int *, int).])
fi])# PGAC_HAVE_GCC__ATOMIC_INT32_CAS
# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
# -------------------------
# Check if the C compiler understands __atomic_compare_exchange_n() for 64bit
# types, and define HAVE_GCC__ATOMIC_INT64_CAS if so.
AC_DEFUN([PGAC_HAVE_GCC__ATOMIC_INT64_CAS],
[AC_CACHE_CHECK(for builtin __atomic int64 atomic operations, pgac_cv_gcc_atomic_int64_cas,
[AC_TRY_LINK([],
[PG_INT64_TYPE val = 0;
PG_INT64_TYPE expect = 0;
__atomic_compare_exchange_n(&val, &expect, 37, 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);],
[pgac_cv_gcc_atomic_int64_cas="yes"],
[pgac_cv_gcc_atomic_int64_cas="no"])])
if test x"$pgac_cv_gcc_atomic_int64_cas" = x"yes"; then
AC_DEFINE(HAVE_GCC__ATOMIC_INT64_CAS, 1, [Define to 1 if you have __atomic_compare_exchange_n(int64 *, int *, int64).])
fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS

configure

@ -802,6 +802,7 @@ enable_nls
with_pgport
enable_rpath
enable_spinlocks
enable_atomics
enable_debug
enable_profiling
enable_coverage
@ -1470,6 +1471,7 @@ Optional Features:
--disable-rpath do not embed shared library search path in
executables
--disable-spinlocks do not use spinlocks
--disable-atomics do not use atomic operations
--enable-debug build with debugging symbols (-g)
--enable-profiling build with profiling enabled
--enable-coverage build with coverage testing instrumentation
@ -3145,6 +3147,33 @@ fi
#
# Atomic operations
#
# Check whether --enable-atomics was given.
if test "${enable_atomics+set}" = set; then :
enableval=$enable_atomics;
case $enableval in
yes)
:
;;
no)
:
;;
*)
as_fn_error $? "no argument expected for --enable-atomics option" "$LINENO" 5
;;
esac
else
enable_atomics=yes
fi
#
# --enable-debug adds -g to compiler flags
#
@ -8349,6 +8378,17 @@ $as_echo "$as_me: WARNING:
*** Not using spinlocks will cause poor performance." >&2;}
fi
if test "$enable_atomics" = yes; then
$as_echo "#define HAVE_ATOMICS 1" >>confdefs.h
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING:
*** Not using atomic operations will cause poor performance." >&5
$as_echo "$as_me: WARNING:
*** Not using atomic operations will cause poor performance." >&2;}
fi
if test "$with_gssapi" = yes ; then
if test "$PORTNAME" != "win32"; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing gss_init_sec_context" >&5
@ -9123,7 +9163,7 @@ fi
done
for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
do :
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@ -12154,40 +12194,6 @@ fi
done
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for builtin locking functions" >&5
$as_echo_n "checking for builtin locking functions... " >&6; }
if ${pgac_cv_gcc_int_atomics+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
int lock = 0;
__sync_lock_test_and_set(&lock, 1);
__sync_lock_release(&lock);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_gcc_int_atomics="yes"
else
pgac_cv_gcc_int_atomics="no"
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_gcc_int_atomics" >&5
$as_echo "$pgac_cv_gcc_int_atomics" >&6; }
if test x"$pgac_cv_gcc_int_atomics" = x"yes"; then
$as_echo "#define HAVE_GCC_INT_ATOMICS 1" >>confdefs.h
fi
# Lastly, restore full LIBS list and check for readline/libedit symbols
LIBS="$LIBS_including_readline"
@ -13711,6 +13717,204 @@ _ACEOF
fi
# Check for various atomic operations now that we have checked how to declare
# 64bit integers.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for builtin __sync char locking functions" >&5
$as_echo_n "checking for builtin __sync char locking functions... " >&6; }
if ${pgac_cv_gcc_sync_char_tas+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
char lock = 0;
__sync_lock_test_and_set(&lock, 1);
__sync_lock_release(&lock);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_gcc_sync_char_tas="yes"
else
pgac_cv_gcc_sync_char_tas="no"
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_gcc_sync_char_tas" >&5
$as_echo "$pgac_cv_gcc_sync_char_tas" >&6; }
if test x"$pgac_cv_gcc_sync_char_tas" = x"yes"; then
$as_echo "#define HAVE_GCC__SYNC_CHAR_TAS 1" >>confdefs.h
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for builtin __sync int32 locking functions" >&5
$as_echo_n "checking for builtin __sync int32 locking functions... " >&6; }
if ${pgac_cv_gcc_sync_int32_tas+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
int lock = 0;
__sync_lock_test_and_set(&lock, 1);
__sync_lock_release(&lock);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_gcc_sync_int32_tas="yes"
else
pgac_cv_gcc_sync_int32_tas="no"
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_gcc_sync_int32_tas" >&5
$as_echo "$pgac_cv_gcc_sync_int32_tas" >&6; }
if test x"$pgac_cv_gcc_sync_int32_tas" = x"yes"; then
$as_echo "#define HAVE_GCC__SYNC_INT32_TAS 1" >>confdefs.h
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for builtin __sync int32 atomic operations" >&5
$as_echo_n "checking for builtin __sync int32 atomic operations... " >&6; }
if ${pgac_cv_gcc_sync_int32_cas+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
int val = 0;
__sync_val_compare_and_swap(&val, 0, 37);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_gcc_sync_int32_cas="yes"
else
pgac_cv_gcc_sync_int32_cas="no"
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_gcc_sync_int32_cas" >&5
$as_echo "$pgac_cv_gcc_sync_int32_cas" >&6; }
if test x"$pgac_cv_gcc_sync_int32_cas" = x"yes"; then
$as_echo "#define HAVE_GCC__SYNC_INT32_CAS 1" >>confdefs.h
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for builtin __sync int64 atomic operations" >&5
$as_echo_n "checking for builtin __sync int64 atomic operations... " >&6; }
if ${pgac_cv_gcc_sync_int64_cas+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
PG_INT64_TYPE lock = 0;
__sync_val_compare_and_swap(&lock, 0, (PG_INT64_TYPE) 37);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_gcc_sync_int64_cas="yes"
else
pgac_cv_gcc_sync_int64_cas="no"
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_gcc_sync_int64_cas" >&5
$as_echo "$pgac_cv_gcc_sync_int64_cas" >&6; }
if test x"$pgac_cv_gcc_sync_int64_cas" = x"yes"; then
$as_echo "#define HAVE_GCC__SYNC_INT64_CAS 1" >>confdefs.h
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for builtin __atomic int32 atomic operations" >&5
$as_echo_n "checking for builtin __atomic int32 atomic operations... " >&6; }
if ${pgac_cv_gcc_atomic_int32_cas+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
int val = 0;
int expect = 0;
__atomic_compare_exchange_n(&val, &expect, 37, 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_gcc_atomic_int32_cas="yes"
else
pgac_cv_gcc_atomic_int32_cas="no"
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_gcc_atomic_int32_cas" >&5
$as_echo "$pgac_cv_gcc_atomic_int32_cas" >&6; }
if test x"$pgac_cv_gcc_atomic_int32_cas" = x"yes"; then
$as_echo "#define HAVE_GCC__ATOMIC_INT32_CAS 1" >>confdefs.h
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for builtin __atomic int64 atomic operations" >&5
$as_echo_n "checking for builtin __atomic int64 atomic operations... " >&6; }
if ${pgac_cv_gcc_atomic_int64_cas+:} false; then :
$as_echo_n "(cached) " >&6
else
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
PG_INT64_TYPE val = 0;
PG_INT64_TYPE expect = 0;
__atomic_compare_exchange_n(&val, &expect, 37, 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
pgac_cv_gcc_atomic_int64_cas="yes"
else
pgac_cv_gcc_atomic_int64_cas="no"
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_gcc_atomic_int64_cas" >&5
$as_echo "$pgac_cv_gcc_atomic_int64_cas" >&6; }
if test x"$pgac_cv_gcc_atomic_int64_cas" = x"yes"; then
$as_echo "#define HAVE_GCC__ATOMIC_INT64_CAS 1" >>confdefs.h
fi
if test "$PORTNAME" != "win32"
then


@ -178,6 +178,12 @@ AC_SUBST(enable_rpath)
PGAC_ARG_BOOL(enable, spinlocks, yes,
[do not use spinlocks])
#
# Atomic operations
#
PGAC_ARG_BOOL(enable, atomics, yes,
[do not use atomic operations])
#
# --enable-debug adds -g to compiler flags
#
@ -936,6 +942,13 @@ else
*** Not using spinlocks will cause poor performance.])
fi
if test "$enable_atomics" = yes; then
AC_DEFINE(HAVE_ATOMICS, 1, [Define to 1 if you want to use atomics if available.])
else
AC_MSG_WARN([
*** Not using atomic operations will cause poor performance.])
fi
if test "$with_gssapi" = yes ; then
if test "$PORTNAME" != "win32"; then
AC_SEARCH_LIBS(gss_init_sec_context, [gssapi_krb5 gss 'gssapi -lkrb5 -lcrypto'], [],
@ -1003,7 +1016,7 @@ AC_SUBST(UUID_LIBS)
##
dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
# On BSD, test for net/if.h will fail unless sys/socket.h
# is included first.
@ -1467,17 +1480,6 @@ fi
AC_CHECK_FUNCS([strtoll strtoq], [break])
AC_CHECK_FUNCS([strtoull strtouq], [break])
AC_CACHE_CHECK([for builtin locking functions], pgac_cv_gcc_int_atomics,
[AC_TRY_LINK([],
[int lock = 0;
__sync_lock_test_and_set(&lock, 1);
__sync_lock_release(&lock);],
[pgac_cv_gcc_int_atomics="yes"],
[pgac_cv_gcc_int_atomics="no"])])
if test x"$pgac_cv_gcc_int_atomics" = x"yes"; then
AC_DEFINE(HAVE_GCC_INT_ATOMICS, 1, [Define to 1 if you have __sync_lock_test_and_set(int *) and friends.])
fi
# Lastly, restore full LIBS list and check for readline/libedit symbols
LIBS="$LIBS_including_readline"
@ -1746,6 +1748,14 @@ AC_CHECK_TYPES([int8, uint8, int64, uint64], [], [],
# C, but is missing on some old platforms.
AC_CHECK_TYPES(sig_atomic_t, [], [], [#include <signal.h>])
# Check for various atomic operations now that we have checked how to declare
# 64bit integers.
PGAC_HAVE_GCC__SYNC_CHAR_TAS
PGAC_HAVE_GCC__SYNC_INT32_TAS
PGAC_HAVE_GCC__SYNC_INT32_CAS
PGAC_HAVE_GCC__SYNC_INT64_CAS
PGAC_HAVE_GCC__ATOMIC_INT32_CAS
PGAC_HAVE_GCC__ATOMIC_INT64_CAS
if test "$PORTNAME" != "win32"
then


@ -30,6 +30,7 @@
#include "common/username.h"
#include "postmaster/postmaster.h"
#include "storage/barrier.h"
#include "storage/s_lock.h"
#include "storage/spin.h"
#include "tcop/tcopprot.h"
#include "utils/help_config.h"


@ -21,7 +21,7 @@ subdir = src/backend/port
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = dynloader.o pg_sema.o pg_shmem.o pg_latch.o $(TAS)
OBJS = atomics.o dynloader.o pg_sema.o pg_shmem.o pg_latch.o $(TAS)
ifeq ($(PORTNAME), darwin)
SUBDIRS += darwin

src/backend/port/atomics.c

@ -0,0 +1,127 @@
/*-------------------------------------------------------------------------
*
* atomics.c
* Non-Inline parts of the atomics implementation
*
* Portions Copyright (c) 2013-2014, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* src/backend/port/atomics.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
/*
* We want the functions below to be inline; but if the compiler doesn't
* support that, fall back on providing them as regular functions. See
* STATIC_IF_INLINE in c.h.
*/
#define ATOMICS_INCLUDE_DEFINITIONS
#include "port/atomics.h"
#include "storage/spin.h"
#ifdef PG_HAVE_MEMORY_BARRIER_EMULATION
void
pg_spinlock_barrier(void)
{
S_LOCK(&dummy_spinlock);
S_UNLOCK(&dummy_spinlock);
}
#endif
#ifdef PG_HAVE_ATOMIC_FLAG_SIMULATION
void
pg_atomic_init_flag_impl(volatile pg_atomic_flag *ptr)
{
StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t),
"size mismatch of atomic_flag vs slock_t");
#ifndef HAVE_SPINLOCKS
/*
* NB: If we're using semaphore based TAS emulation, be careful to use a
* separate set of semaphores. Otherwise we'd get in trouble if an atomic
* var were manipulated while a spinlock is held.
*/
s_init_lock_sema((slock_t *) &ptr->sema, true);
#else
SpinLockInit((slock_t *) &ptr->sema);
#endif
}
bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
return TAS((slock_t *) &ptr->sema);
}
void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
S_UNLOCK((slock_t *) &ptr->sema);
}
#endif /* PG_HAVE_ATOMIC_FLAG_SIMULATION */
#ifdef PG_HAVE_ATOMIC_U32_SIMULATION
void
pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_)
{
StaticAssertStmt(sizeof(ptr->sema) >= sizeof(slock_t),
"size mismatch of atomic_flag vs slock_t");
/*
* If we're using semaphore based atomic flags, be careful about nested
* usage of atomics while a spinlock is held.
*/
#ifndef HAVE_SPINLOCKS
s_init_lock_sema((slock_t *) &ptr->sema, true);
#else
SpinLockInit((slock_t *) &ptr->sema);
#endif
ptr->value = val_;
}
bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
bool ret;
/*
* Do atomic op under a spinlock. It might look like we could just skip
* the cmpxchg if the lock isn't available, but that'd just emulate a
* 'weak' compare and swap. I.e. one that allows spurious failures. Since
* several algorithms rely on a strong variant and that is efficiently
* implementable on most major architectures let's emulate it here as
* well.
*/
SpinLockAcquire((slock_t *) &ptr->sema);
/* perform compare/exchange logic*/
ret = ptr->value == *expected;
*expected = ptr->value;
if (ret)
ptr->value = newval;
/* and release lock */
SpinLockRelease((slock_t *) &ptr->sema);
return ret;
}
uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
uint32 oldval;
SpinLockAcquire((slock_t *) &ptr->sema);
oldval = ptr->value;
ptr->value += add_;
SpinLockRelease((slock_t *) &ptr->sema);
return oldval;
}
#endif /* PG_HAVE_ATOMIC_U32_SIMULATION */


@ -67,7 +67,7 @@ SpinlockSemas(void)
int
SpinlockSemas(void)
{
return NUM_SPINLOCK_SEMAPHORES;
return NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES;
}
/*
@ -77,8 +77,9 @@ extern void
SpinlockSemaInit(PGSemaphore spinsemas)
{
int i;
int nsemas = SpinlockSemas();
for (i = 0; i < NUM_SPINLOCK_SEMAPHORES; ++i)
for (i = 0; i < nsemas; ++i)
PGSemaphoreCreate(&spinsemas[i]);
SpinlockSemaArray = spinsemas;
}
@ -88,7 +89,7 @@ SpinlockSemaInit(PGSemaphore spinsemas)
*/
void
s_init_lock_sema(volatile slock_t *lock)
s_init_lock_sema(volatile slock_t *lock, bool nested)
{
static int counter = 0;


@ -582,6 +582,7 @@ typedef NameData *Name;
#define AssertMacro(condition) ((void)true)
#define AssertArg(condition)
#define AssertState(condition)
#define AssertPointerAlignment(ptr, bndr) ((void)true)
#define Trap(condition, errorType)
#define TrapMacro(condition, errorType) (true)
@ -592,6 +593,7 @@ typedef NameData *Name;
#define AssertMacro(p) ((void) assert(p))
#define AssertArg(condition) assert(condition)
#define AssertState(condition) assert(condition)
#define AssertPointerAlignment(ptr, bndr) ((void)true)
#else /* USE_ASSERT_CHECKING && !FRONTEND */
/*
@ -628,8 +630,15 @@ typedef NameData *Name;
#define AssertState(condition) \
Trap(!(condition), "BadState")
#endif /* USE_ASSERT_CHECKING && !FRONTEND */
/*
* Check that `ptr' is `bndr' aligned.
*/
#define AssertPointerAlignment(ptr, bndr) \
Trap(TYPEALIGN(bndr, (uintptr_t)(ptr)) != (uintptr_t)(ptr), \
"UnalignedPointer")
#endif /* USE_ASSERT_CHECKING && !FRONTEND */
/*
* Macros to support compile-time assertion checks.
@ -856,12 +865,22 @@ typedef NameData *Name;
* The header must also declare the functions' prototypes, protected by
* !PG_USE_INLINE.
*/
/* declarations which are only visible when not inlining and in the .c file */
#ifdef PG_USE_INLINE
#define STATIC_IF_INLINE static inline
#else
#define STATIC_IF_INLINE
#endif /* PG_USE_INLINE */
/* declarations which are marked inline when inlining, extern otherwise */
#ifdef PG_USE_INLINE
#define STATIC_IF_INLINE_DECLARE static inline
#else
#define STATIC_IF_INLINE_DECLARE extern
#endif /* PG_USE_INLINE */
/* ----------------------------------------------------------------
* Section 8: random stuff
* ----------------------------------------------------------------


@ -87,6 +87,12 @@
/* Define to 1 if you have the `append_history' function. */
#undef HAVE_APPEND_HISTORY
/* Define to 1 if you want to use atomics if available. */
#undef HAVE_ATOMICS
/* Define to 1 if you have the <atomic.h> header file. */
#undef HAVE_ATOMIC_H
/* Define to 1 if you have the `cbrt' function. */
#undef HAVE_CBRT
@ -173,8 +179,24 @@
/* Define to 1 if your compiler understands __FUNCTION__. */
#undef HAVE_FUNCNAME__FUNCTION
/* Define to 1 if you have __atomic_compare_exchange_n(int *, int *, int). */
#undef HAVE_GCC__ATOMIC_INT32_CAS
/* Define to 1 if you have __atomic_compare_exchange_n(int64 *, int *, int64).
*/
#undef HAVE_GCC__ATOMIC_INT64_CAS
/* Define to 1 if you have __sync_lock_test_and_set(char *) and friends. */
#undef HAVE_GCC__SYNC_CHAR_TAS
/* Define to 1 if you have __sync_compare_and_swap(int *, int, int). */
#undef HAVE_GCC__SYNC_INT32_CAS
/* Define to 1 if you have __sync_lock_test_and_set(int *) and friends. */
#undef HAVE_GCC_INT_ATOMICS
#undef HAVE_GCC__SYNC_INT32_TAS
/* Define to 1 if you have __sync_compare_and_swap(int64 *, int64, int64). */
#undef HAVE_GCC__SYNC_INT64_CAS
/* Define to 1 if you have the `getaddrinfo' function. */
#undef HAVE_GETADDRINFO


@ -334,6 +334,9 @@
/* Define to 1 if you have spinlocks. */
#define HAVE_SPINLOCKS 1
/* Define to 1 if you have atomics. */
#define HAVE_ATOMICS 1
/* Define to 1 if you have the `srandom' function. */
/* #undef HAVE_SRANDOM */


@ -64,6 +64,14 @@
*/
#define NUM_SPINLOCK_SEMAPHORES 1024
/*
* When we have neither spinlocks nor atomic operations support we're
* implementing atomic operations on top of spinlock on top of semaphores. To
* be safe against atomic operations while holding a spinlock separate
* semaphores have to be used.
*/
#define NUM_ATOMICS_SEMAPHORES 64
/*
* Define this if you want to allow the lo_import and lo_export SQL
* functions to be executed by ordinary users. By default these

src/include/port/atomics.h

@ -0,0 +1,531 @@
/*-------------------------------------------------------------------------
*
* atomics.h
* Atomic operations.
*
* Hardware and compiler dependent functions for manipulating memory
* atomically and dealing with cache coherency. Used to implement locking
* facilities and lockless algorithms/data structures.
*
* To bring up postgres on a platform/compiler at the very least
* implementations for the following operations should be provided:
* * pg_compiler_barrier(), pg_write_barrier(), pg_read_barrier()
* * pg_atomic_compare_exchange_u32(), pg_atomic_fetch_add_u32()
* * pg_atomic_test_set_flag(), pg_atomic_init_flag(), pg_atomic_clear_flag()
*
* There exist generic, hardware independent, implementations for several
* compilers which might be sufficient, although possibly not optimal, for a
* new platform. If no such generic implementation is available spinlocks (or
* even OS provided semaphores) will be used to implement the API.
*
* Implement the _u64 variants if and only if your platform can use them
* efficiently (and obviously correctly).
*
* Use higher level functionality (lwlocks, spinlocks, heavyweight locks)
* whenever possible. Writing correct code using these facilities is hard.
*
* For an introduction to using memory barriers within the PostgreSQL backend,
* see src/backend/storage/lmgr/README.barrier
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/port/atomics.h
*
*-------------------------------------------------------------------------
*/
#ifndef ATOMICS_H
#define ATOMICS_H
#define INSIDE_ATOMICS_H
#include <limits.h>
/*
* First a set of architecture specific files is included.
*
* These files can provide the full set of atomics or can do pretty much
* nothing if all the compilers commonly used on these platforms provide
* useable generics.
*
* Don't add an inline assembly of the actual atomic operations if all the
* common implementations of your platform provide intrinsics. Intrinsics are
* much easier to understand and potentially support more architectures.
*
* It will often make sense to define memory barrier semantics here, since
* e.g. generic compiler intrinsics for x86 memory barriers can't know that
* postgres doesn't need x86 read/write barriers to do anything more than a
* compiler barrier.
*
*/
#if defined(__arm__) || defined(__arm) || \
defined(__aarch64__) || defined(__aarch64)
# include "port/atomics/arch-arm.h"
#elif defined(__i386__) || defined(__i386) || defined(__x86_64__)
# include "port/atomics/arch-x86.h"
#elif defined(__ia64__) || defined(__ia64)
# include "port/atomics/arch-ia64.h"
#elif defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
# include "port/atomics/arch-ppc.h"
#elif defined(__hppa) || defined(__hppa__)
# include "port/atomics/arch-hppa.h"
#endif
/*
* Compiler specific, but architecture independent implementations.
*
* Provide architecture independent implementations of the atomic
* facilities. At the very least compiler barriers should be provided, but a
* full implementation of
* * pg_compiler_barrier(), pg_write_barrier(), pg_read_barrier()
* * pg_atomic_compare_exchange_u32(), pg_atomic_fetch_add_u32()
* using compiler intrinsics is a good idea.
*/
/* gcc or compatible, including clang and icc */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
# include "port/atomics/generic-gcc.h"
#elif defined(WIN32_ONLY_COMPILER)
# include "port/atomics/generic-msvc.h"
#elif defined(__hpux) && defined(__ia64) && !defined(__GNUC__)
# include "port/atomics/generic-acc.h"
#elif defined(__SUNPRO_C) && !defined(__GNUC__)
# include "port/atomics/generic-sunpro.h"
#elif (defined(__IBMC__) || defined(__IBMCPP__)) && !defined(__GNUC__)
# include "port/atomics/generic-xlc.h"
#else
/*
* Unsupported compiler, we'll likely use slower fallbacks... At least
* compiler barriers should really be provided.
*/
#endif
/*
* Provide a full fallback of the pg_*_barrier(), pg_atomic_*_flag and
* pg_atomic_*_u32 APIs for platforms without sufficient spinlock and/or
* atomics support. In the case of spinlock backed atomics the emulation is
* expected to be efficient, although less so than native atomics support.
*/
#include "port/atomics/fallback.h"
/*
* Provide additional operations using supported infrastructure. These are
* expected to be efficient if the underlying atomic operations are efficient.
*/
#include "port/atomics/generic.h"
/*
* Provide declarations for all functions here - on most platforms static
* inlines are used and these aren't necessary, but when static inline is
* unsupported these will be external functions.
*/
STATIC_IF_INLINE_DECLARE void pg_atomic_init_flag(volatile pg_atomic_flag *ptr);
STATIC_IF_INLINE_DECLARE bool pg_atomic_test_set_flag(volatile pg_atomic_flag *ptr);
STATIC_IF_INLINE_DECLARE bool pg_atomic_unlocked_test_flag(volatile pg_atomic_flag *ptr);
STATIC_IF_INLINE_DECLARE void pg_atomic_clear_flag(volatile pg_atomic_flag *ptr);
STATIC_IF_INLINE_DECLARE void pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr);
STATIC_IF_INLINE_DECLARE void pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 newval);
STATIC_IF_INLINE_DECLARE bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_fetch_add_u32(volatile pg_atomic_uint32 *ptr, int32 add_);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_fetch_sub_u32(volatile pg_atomic_uint32 *ptr, int32 sub_);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_fetch_and_u32(volatile pg_atomic_uint32 *ptr, uint32 and_);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_);
STATIC_IF_INLINE_DECLARE uint32 pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_);
#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
STATIC_IF_INLINE_DECLARE void pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val_);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr);
STATIC_IF_INLINE_DECLARE void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 newval);
STATIC_IF_INLINE_DECLARE bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_fetch_add_u64(volatile pg_atomic_uint64 *ptr, int64 add_);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_add_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 add_);
STATIC_IF_INLINE_DECLARE uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_);
#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
/*
* pg_compiler_barrier - prevent the compiler from moving code across
*
* A compiler barrier need not (and preferably should not) emit any actual
* machine code, but must act as an optimization fence: the compiler must not
* reorder loads or stores to main memory around the barrier. However, the
* CPU may still reorder loads or stores at runtime, if the architecture's
* memory model permits this.
*/
#define pg_compiler_barrier() pg_compiler_barrier_impl()
/*
* pg_memory_barrier - prevent the CPU from reordering memory access
*
* A memory barrier must act as a compiler barrier, and in addition must
* guarantee that all loads and stores issued prior to the barrier are
* completed before any loads or stores issued after the barrier. Unless
* loads and stores are totally ordered (which is not the case on most
* architectures) this requires issuing some sort of memory fencing
* instruction.
*/
#define pg_memory_barrier() pg_memory_barrier_impl()
/*
* pg_(read|write)_barrier - prevent the CPU from reordering memory access
*
* A read barrier must act as a compiler barrier, and in addition must
* guarantee that any loads issued prior to the barrier are completed before
* any loads issued after the barrier. Similarly, a write barrier acts
* as a compiler barrier, and also orders stores. Read and write barriers
* are thus weaker than a full memory barrier, but stronger than a compiler
* barrier. In practice, on machines with strong memory ordering, read and
* write barriers may require nothing more than a compiler barrier.
*/
#define pg_read_barrier() pg_read_barrier_impl()
#define pg_write_barrier() pg_write_barrier_impl()
/*
* Spinloop delay - Allow CPU to relax in busy loops
*/
#define pg_spin_delay() pg_spin_delay_impl()
/*
* The following functions are wrapper functions around the platform specific
* implementation of the atomic operations performing common checks.
*/
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
/*
* pg_atomic_init_flag - initialize atomic flag.
*
* No barrier semantics.
*/
STATIC_IF_INLINE_DECLARE void
pg_atomic_init_flag(volatile pg_atomic_flag *ptr)
{
AssertPointerAlignment(ptr, sizeof(*ptr));
pg_atomic_init_flag_impl(ptr);
}
/*
* pg_atomic_test_set_flag - TAS()
*
* Returns true if the flag has successfully been set, false otherwise.
*
* Acquire (including read barrier) semantics.
*/
STATIC_IF_INLINE_DECLARE bool
pg_atomic_test_set_flag(volatile pg_atomic_flag *ptr)
{
AssertPointerAlignment(ptr, sizeof(*ptr));
return pg_atomic_test_set_flag_impl(ptr);
}
/*
* pg_atomic_unlocked_test_flag - Check if the lock is free
*
* Returns true if the flag currently is not set, false otherwise.
*
* No barrier semantics.
*/
STATIC_IF_INLINE_DECLARE bool
pg_atomic_unlocked_test_flag(volatile pg_atomic_flag *ptr)
{
AssertPointerAlignment(ptr, sizeof(*ptr));
return pg_atomic_unlocked_test_flag_impl(ptr);
}
/*
* pg_atomic_clear_flag - release lock set by TAS()
*
* Release (including write barrier) semantics.
*/
STATIC_IF_INLINE_DECLARE void
pg_atomic_clear_flag(volatile pg_atomic_flag *ptr)
{
AssertPointerAlignment(ptr, sizeof(*ptr));
pg_atomic_clear_flag_impl(ptr);
}
/*
* pg_atomic_init_u32 - initialize atomic variable
*
* Has to be done before any concurrent usage.
*
* No barrier semantics.
*/
STATIC_IF_INLINE_DECLARE void
pg_atomic_init_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
{
AssertPointerAlignment(ptr, 4);
pg_atomic_init_u32_impl(ptr, val);
}
/*
* pg_atomic_read_u32 - unlocked read from atomic variable.
*
* The read is guaranteed to return a value as it has been written by this or
* another process at some point in the past. There's however no cache
* coherency interaction guaranteeing the value hasn't since been written to
* again.
*
* No barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
{
AssertPointerAlignment(ptr, 4);
return pg_atomic_read_u32_impl(ptr);
}
/*
* pg_atomic_write_u32 - unlocked write to atomic variable.
*
* The write is guaranteed to succeed as a whole, i.e. it's not possible to
* observe a partial write for any reader.
*
* No barrier semantics.
*/
STATIC_IF_INLINE_DECLARE void
pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
{
AssertPointerAlignment(ptr, 4);
pg_atomic_write_u32_impl(ptr, val);
}
/*
* pg_atomic_exchange_u32 - exchange newval with current value
*
* Returns the old value of 'ptr' before the swap.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 newval)
{
AssertPointerAlignment(ptr, 4);
return pg_atomic_exchange_u32_impl(ptr, newval);
}
/*
* pg_atomic_compare_exchange_u32 - CAS operation
*
* Atomically compare the current value of ptr with *expected and store newval
* iff ptr and *expected have the same value. The current value of *ptr will
* always be stored in *expected.
*
* Return true if values have been exchanged, false otherwise.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE bool
pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
AssertPointerAlignment(ptr, 4);
AssertPointerAlignment(expected, 4);
return pg_atomic_compare_exchange_u32_impl(ptr, expected, newval);
}
/*
* pg_atomic_fetch_add_u32 - atomically add to variable
*
* Returns the value of ptr before the arithmetic operation.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_fetch_add_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
{
AssertPointerAlignment(ptr, 4);
return pg_atomic_fetch_add_u32_impl(ptr, add_);
}
/*
* pg_atomic_fetch_sub_u32 - atomically subtract from variable
*
* Returns the value of ptr before the arithmetic operation. Note that
* sub_ may not be INT_MIN due to platform limitations.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_fetch_sub_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
{
AssertPointerAlignment(ptr, 4);
Assert(sub_ != INT_MIN);
return pg_atomic_fetch_sub_u32_impl(ptr, sub_);
}
/*
* pg_atomic_fetch_and_u32 - atomically bit-and and_ with variable
*
* Returns the value of ptr before the arithmetic operation.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_fetch_and_u32(volatile pg_atomic_uint32 *ptr, uint32 and_)
{
AssertPointerAlignment(ptr, 4);
return pg_atomic_fetch_and_u32_impl(ptr, and_);
}
/*
* pg_atomic_fetch_or_u32 - atomically bit-or or_ with variable
*
* Returns the value of ptr before the arithmetic operation.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
{
AssertPointerAlignment(ptr, 4);
return pg_atomic_fetch_or_u32_impl(ptr, or_);
}
/*
* pg_atomic_add_fetch_u32 - atomically add to variable
*
* Returns the value of ptr after the arithmetic operation.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_add_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 add_)
{
AssertPointerAlignment(ptr, 4);
return pg_atomic_add_fetch_u32_impl(ptr, add_);
}
/*
* pg_atomic_sub_fetch_u32 - atomically subtract from variable
*
* Returns the value of ptr after the arithmetic operation. Note that sub_
* may not be INT_MIN due to platform limitations.
*
* Full barrier semantics.
*/
STATIC_IF_INLINE uint32
pg_atomic_sub_fetch_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
{
AssertPointerAlignment(ptr, 4);
Assert(sub_ != INT_MIN);
return pg_atomic_sub_fetch_u32_impl(ptr, sub_);
}
/* ----
* The 64 bit operations have the same semantics as their 32bit counterparts
* if they are available. Check the corresponding 32bit function for
* documentation.
* ----
*/
#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
STATIC_IF_INLINE_DECLARE void
pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
{
AssertPointerAlignment(ptr, 8);
pg_atomic_init_u64_impl(ptr, val);
}
STATIC_IF_INLINE uint64
pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
{
AssertPointerAlignment(ptr, 8);
return pg_atomic_read_u64_impl(ptr);
}
STATIC_IF_INLINE void
pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
{
AssertPointerAlignment(ptr, 8);
pg_atomic_write_u64_impl(ptr, val);
}
STATIC_IF_INLINE uint64
pg_atomic_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 newval)
{
AssertPointerAlignment(ptr, 8);
return pg_atomic_exchange_u64_impl(ptr, newval);
}
STATIC_IF_INLINE bool
pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
AssertPointerAlignment(ptr, 8);
AssertPointerAlignment(expected, 8);
return pg_atomic_compare_exchange_u64_impl(ptr, expected, newval);
}
STATIC_IF_INLINE uint64
pg_atomic_fetch_add_u64(volatile pg_atomic_uint64 *ptr, int64 add_)
{
AssertPointerAlignment(ptr, 8);
return pg_atomic_fetch_add_u64_impl(ptr, add_);
}
STATIC_IF_INLINE uint64
pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
{
AssertPointerAlignment(ptr, 8);
Assert(sub_ != -INT64CONST(0x7FFFFFFFFFFFFFFF) - 1);
return pg_atomic_fetch_sub_u64_impl(ptr, sub_);
}
STATIC_IF_INLINE uint64
pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
{
AssertPointerAlignment(ptr, 8);
return pg_atomic_fetch_and_u64_impl(ptr, and_);
}
STATIC_IF_INLINE uint64
pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
{
AssertPointerAlignment(ptr, 8);
return pg_atomic_fetch_or_u64_impl(ptr, or_);
}
STATIC_IF_INLINE uint64
pg_atomic_add_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 add_)
{
AssertPointerAlignment(ptr, 8);
return pg_atomic_add_fetch_u64_impl(ptr, add_);
}
STATIC_IF_INLINE uint64
pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
{
AssertPointerAlignment(ptr, 8);
Assert(sub_ != -INT64CONST(0x7FFFFFFFFFFFFFFF) - 1);
return pg_atomic_sub_fetch_u64_impl(ptr, sub_);
}
#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */
#undef INSIDE_ATOMICS_H
#endif /* ATOMICS_H */


@ -0,0 +1,25 @@
/*-------------------------------------------------------------------------
*
* arch-arm.h
* Atomic operations considerations specific to ARM
*
* Portions Copyright (c) 2013-2014, PostgreSQL Global Development Group
*
* NOTES:
*
* src/include/port/atomics/arch-arm.h
*
*-------------------------------------------------------------------------
*/
/* intentionally no include guards, should only be included by atomics.h */
#ifndef INSIDE_ATOMICS_H
#error "should be included via atomics.h"
#endif
/*
* 64 bit atomics on arm are implemented using kernel fallbacks and might be
* slow, so disable entirely for now.
* XXX: We might want to change that at some point for AARCH64
*/
#define PG_DISABLE_64_BIT_ATOMICS


@ -0,0 +1,17 @@
/*-------------------------------------------------------------------------
*
* arch-hppa.h
* Atomic operations considerations specific to HPPA
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* src/include/port/atomics/arch-hppa.h
*
*-------------------------------------------------------------------------
*/
/* HPPA doesn't do either read or write reordering */
#define pg_memory_barrier_impl() pg_compiler_barrier_impl()


@ -0,0 +1,26 @@
/*-------------------------------------------------------------------------
*
* arch-ia64.h
* Atomic operations considerations specific to intel itanium
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* src/include/port/atomics/arch-ia64.h
*
*-------------------------------------------------------------------------
*/
/*
* Itanium is weakly ordered, so read and write barriers require a full
* fence.
*/
#if defined(__INTEL_COMPILER)
# define pg_memory_barrier_impl() __mf()
#elif defined(__GNUC__)
# define pg_memory_barrier_impl() __asm__ __volatile__ ("mf" : : : "memory")
#elif defined(__hpux)
# define pg_memory_barrier_impl() _Asm_mf()
#endif


@ -0,0 +1,26 @@
/*-------------------------------------------------------------------------
*
* arch-ppc.h
* Atomic operations considerations specific to PowerPC
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* src/include/port/atomics/arch-ppc.h
*
*-------------------------------------------------------------------------
*/
#if defined(__GNUC__)
/*
* lwsync orders loads with respect to each other, and similarly with stores.
* But a load can be performed before a subsequent store, so sync must be used
* for a full memory barrier.
*/
#define pg_memory_barrier_impl() __asm__ __volatile__ ("sync" : : : "memory")
#define pg_read_barrier_impl() __asm__ __volatile__ ("lwsync" : : : "memory")
#define pg_write_barrier_impl() __asm__ __volatile__ ("lwsync" : : : "memory")
#endif


@ -0,0 +1,241 @@
/*-------------------------------------------------------------------------
*
* arch-x86.h
* Atomic operations considerations specific to intel x86
*
* Note that we actually require a 486 upwards because the 386 doesn't have
* support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
* anymore, that's not much of a restriction, luckily.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* src/include/port/atomics/arch-x86.h
*
*-------------------------------------------------------------------------
*/
/*
* Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
* or stores to be reordered with other stores, but a load can be performed
* before a subsequent store.
*
* Technically, some x86-ish chips support uncached memory access and/or
* special instructions that are weakly ordered. In those cases we'd need
* the read and write barriers to be lfence and sfence. But since we don't
* do those things, a compiler barrier should be enough.
*
* "lock; addl" has worked for longer than "mfence". It's also rumored to be
* faster in many scenarios
*/
#if defined(__INTEL_COMPILER)
#define pg_memory_barrier_impl() _mm_mfence()
#elif defined(__GNUC__) && (defined(__i386__) || defined(__i386))
#define pg_memory_barrier_impl() \
__asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__GNUC__) && defined(__x86_64__)
#define pg_memory_barrier_impl() \
__asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#define pg_read_barrier_impl() pg_compiler_barrier_impl()
#define pg_write_barrier_impl() pg_compiler_barrier_impl()
/*
* Provide implementation for atomics using inline assembly on x86 gcc. It's
* nice to support older gcc's and the compare/exchange implementation here is
* actually more efficient than the __sync variant.
*/
#if defined(HAVE_ATOMICS)
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
volatile char value;
} pg_atomic_flag;
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
volatile uint32 value;
} pg_atomic_uint32;
/*
* It's too complicated to write inline asm for 64bit types on 32bit and the
* 486 can't do it.
*/
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
volatile uint64 value;
} pg_atomic_uint64;
#endif
#endif /* defined(__GNUC__) && !defined(__INTEL_COMPILER) */
#endif /* defined(HAVE_ATOMICS) */
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#if !defined(PG_HAVE_SPIN_DELAY)
/*
* This sequence is equivalent to the PAUSE instruction ("rep" is
* ignored by old IA32 processors if the following instruction is
* not a string operation); the IA-32 Architecture Software
* Developer's Manual, Vol. 3, Section 7.7.2 describes why using
* PAUSE in the inner loop of a spin lock is necessary for good
* performance:
*
* The PAUSE instruction improves the performance of IA-32
* processors supporting Hyper-Threading Technology when
* executing spin-wait loops and other routines where one
* thread is accessing a shared lock or semaphore in a tight
* polling loop. When executing a spin-wait loop, the
* processor can suffer a severe performance penalty when
* exiting the loop because it detects a possible memory order
* violation and flushes the core processor's pipeline. The
* PAUSE instruction provides a hint to the processor that the
* code sequence is a spin-wait loop. The processor uses this
* hint to avoid the memory order violation and prevent the
* pipeline flush. In addition, the PAUSE instruction
* de-pipelines the spin-wait loop to prevent it from
* consuming execution resources excessively.
*/
#if defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static inline void
pg_spin_delay_impl(void)
{
_mm_pause();
}
#elif defined(__GNUC__)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
__asm__ __volatile__(
" rep; nop \n");
}
#elif defined(WIN32_ONLY_COMPILER) && defined(__x86_64__)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
_mm_pause();
}
#elif defined(WIN32_ONLY_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
/* See comment for gcc code. Same code, MASM syntax */
__asm rep nop;
}
#endif
#endif /* !defined(PG_HAVE_SPIN_DELAY) */
#if defined(HAVE_ATOMICS)
/* inline assembly implementation for gcc */
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
register char _res = 1;
__asm__ __volatile__(
" lock \n"
" xchgb %0,%1 \n"
: "+q"(_res), "+m"(ptr->value)
:
: "memory");
return _res == 0;
}
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
char ret;
/*
* Perform cmpxchg and use the zero flag which it implicitly sets when
* equal to measure the success.
*/
__asm__ __volatile__(
" lock \n"
" cmpxchgl %4,%5 \n"
" setz %2 \n"
: "=a" (*expected), "=m"(ptr->value), "=r" (ret)
: "a" (*expected), "r" (newval), "m"(ptr->value)
: "memory", "cc");
return (bool) ret;
}
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
uint32 res;
__asm__ __volatile__(
" lock \n"
" xaddl %0,%1 \n"
: "=q"(res), "=m"(ptr->value)
: "0" (add_), "m"(ptr->value)
: "memory", "cc");
return res;
}
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
char ret;
/*
* Perform cmpxchg and use the zero flag which it implicitly sets when
* equal to measure the success.
*/
__asm__ __volatile__(
" lock \n"
" cmpxchgq %4,%5 \n"
" setz %2 \n"
: "=a" (*expected), "=m"(ptr->value), "=r" (ret)
: "a" (*expected), "r" (newval), "m"(ptr->value)
: "memory", "cc");
return (bool) ret;
}
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
uint64 res;
__asm__ __volatile__(
" lock \n"
" xaddq %0,%1 \n"
: "=q"(res), "=m"(ptr->value)
: "0" (add_), "m"(ptr->value)
: "memory", "cc");
return res;
}
#endif /* __x86_64__ */
#endif /* defined(__GNUC__) && !defined(__INTEL_COMPILER) */
#endif /* HAVE_ATOMICS */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */


@ -0,0 +1,132 @@
/*-------------------------------------------------------------------------
*
* fallback.h
* Fallback for platforms without spinlock and/or atomics support. Slower
* than native atomics support, but not unusably slow.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/port/atomics/fallback.h
*
*-------------------------------------------------------------------------
*/
/* intentionally no include guards, should only be included by atomics.h */
#ifndef INSIDE_ATOMICS_H
# error "should be included via atomics.h"
#endif
#ifndef pg_memory_barrier_impl
/*
* If we have no memory barrier implementation for this architecture, we
* fall back to acquiring and releasing a spinlock. This might, in turn,
* fall back to the semaphore-based spinlock implementation, which will be
* amazingly slow.
*
* It's not self-evident that every possible legal implementation of a
* spinlock acquire-and-release would be equivalent to a full memory barrier.
* For example, I'm not sure that Itanium's acq and rel add up to a full
* fence. But all of our actual implementations seem OK in this regard.
*/
#define PG_HAVE_MEMORY_BARRIER_EMULATION
extern void pg_spinlock_barrier(void);
#define pg_memory_barrier_impl pg_spinlock_barrier
#endif
/*
* If we have no atomics implementation for this platform, fall back to providing
* the atomics API using a spinlock to protect the internal state. Possibly
* the spinlock implementation uses semaphores internally...
*
* We have to be a bit careful here, as it's not guaranteed that atomic
* variables are mapped to the same address in every process (e.g. dynamic
* shared memory segments). We can't just hash the address and use that to map
* to a spinlock. Instead assign a spinlock on initialization of the atomic
* variable.
*/
#if !defined(PG_HAVE_ATOMIC_FLAG_SUPPORT) && !defined(PG_HAVE_ATOMIC_U32_SUPPORT)
#define PG_HAVE_ATOMIC_FLAG_SIMULATION
#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
/*
* To avoid circular includes we can't use s_lock as a type here. Instead
* just reserve enough space for all spinlock types. Some platforms would
* be content with just one byte instead of 4, but that's not too much
* waste.
*/
#if defined(__hppa) || defined(__hppa__) /* HP PA-RISC, GCC and HP compilers */
int sema[4];
#else
int sema;
#endif
} pg_atomic_flag;
#endif /* PG_HAVE_ATOMIC_FLAG_SUPPORT */
#if !defined(PG_HAVE_ATOMIC_U32_SUPPORT)
#define PG_HAVE_ATOMIC_U32_SIMULATION
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
/* Check pg_atomic_flag's definition above for an explanation */
#if defined(__hppa) || defined(__hppa__) /* HP PA-RISC, GCC and HP compilers */
int sema[4];
#else
int sema;
#endif
volatile uint32 value;
} pg_atomic_uint32;
#endif /* PG_HAVE_ATOMIC_U32_SUPPORT */
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#ifdef PG_HAVE_ATOMIC_FLAG_SIMULATION
#define PG_HAVE_ATOMIC_INIT_FLAG
extern void pg_atomic_init_flag_impl(volatile pg_atomic_flag *ptr);
#define PG_HAVE_ATOMIC_TEST_SET_FLAG
extern bool pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr);
#define PG_HAVE_ATOMIC_CLEAR_FLAG
extern void pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr);
#define PG_HAVE_ATOMIC_UNLOCKED_TEST_FLAG
static inline bool
pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr)
{
/*
* Can't do this efficiently in the semaphore based implementation - we'd
* have to try to acquire the semaphore - so always return true. That's
* correct, because this is only an unlocked test anyway. Do this in the
* header so compilers can optimize the test away.
*/
return true;
}
#endif /* PG_HAVE_ATOMIC_FLAG_SIMULATION */
#ifdef PG_HAVE_ATOMIC_U32_SIMULATION
#define PG_HAVE_ATOMIC_INIT_U32
extern void pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_);
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
extern bool pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval);
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
extern uint32 pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_);
#endif /* PG_HAVE_ATOMIC_U32_SIMULATION */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */
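The matching definitions are compiled into atomics.c. A rough sketch of the spinlock-protected uint32 fallback - assuming the SpinLockInit/SpinLockAcquire/SpinLockRelease wrappers from storage/spin.h, treating the embedded sema field as an slock_t, and ignoring the --disable-spinlocks special case - might look like this:

#include "port/atomics.h"
#include "storage/spin.h"

void
pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_)
{
    /* every atomic variable gets its own spinlock, assigned at init time */
    SpinLockInit((slock_t *) &ptr->sema);
    ptr->value = val_;
}

bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
    bool        ret;

    /*
     * Perform the compare/exchange under the per-variable spinlock, so the
     * emulation still behaves like a "strong" CAS without spurious failures.
     */
    SpinLockAcquire((slock_t *) &ptr->sema);

    ret = ptr->value == *expected;
    *expected = ptr->value;
    if (ret)
        ptr->value = newval;

    SpinLockRelease((slock_t *) &ptr->sema);

    return ret;
}

uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
    uint32      oldval;

    SpinLockAcquire((slock_t *) &ptr->sema);
    oldval = ptr->value;
    ptr->value += add_;
    SpinLockRelease((slock_t *) &ptr->sema);

    return oldval;
}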

View File

@ -0,0 +1,99 @@
/*-------------------------------------------------------------------------
*
* generic-acc.h
* Atomic operations support when using HPs acc on HPUX
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* Documentation:
* * inline assembly for Itanium-based HP-UX:
* http://h21007.www2.hp.com/portal/download/files/unprot/Itanium/inline_assem_ERS.pdf
* * Implementing Spinlocks on the Intel (R) Itanium (R) Architecture and PA-RISC
* http://h21007.www2.hp.com/portal/download/files/unprot/itanium/spinlocks.pdf
*
* Itanium only supports a small set of numbers (-16, -8, -4, -1, 1, 4, 8, 16)
* for atomic add/sub, so we just implement everything but compare_exchange
* via the compare_exchange fallbacks in atomics/generic.h.
*
* src/include/port/atomics/generic-acc.h
*
* -------------------------------------------------------------------------
*/
#include <machine/sys/inline.h>
/* IA64 always has 32/64 bit atomics */
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
volatile uint32 value;
} pg_atomic_uint32;
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
volatile uint64 value;
} pg_atomic_uint64;
#define pg_compiler_barrier_impl() _Asm_sched_fence()
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#define MINOR_FENCE (_Asm_fence) (_UP_CALL_FENCE | _UP_SYS_FENCE | \
_DOWN_CALL_FENCE | _DOWN_SYS_FENCE )
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
STATIC_IF_INLINE bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
bool ret;
uint32 current;
_Asm_mov_to_ar(_AREG_CCV, *expected, MINOR_FENCE);
/*
* We want a barrier, not just release/acquire semantics.
*/
_Asm_mf();
/*
* Notes:
* DOWN_MEM_FENCE | _UP_MEM_FENCE prevents reordering by the compiler
*/
current = _Asm_cmpxchg(_SZ_W, /* word */
_SEM_REL,
&ptr->value,
newval, _LDHINT_NONE,
_DOWN_MEM_FENCE | _UP_MEM_FENCE);
ret = current == *expected;
*expected = current;
return ret;
}
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
STATIC_IF_INLINE bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
bool ret;
uint64 current;
_Asm_mov_to_ar(_AREG_CCV, *expected, MINOR_FENCE);
_Asm_mf();
current = _Asm_cmpxchg(_SZ_D, /* doubleword */
_SEM_REL,
&ptr->value,
newval, _LDHINT_NONE,
_DOWN_MEM_FENCE | _UP_MEM_FENCE);
ret = current == *expected;
*expected = current;
return ret;
}
#undef MINOR_FENCE
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */

View File

@ -0,0 +1,236 @@
/*-------------------------------------------------------------------------
*
* generic-gcc.h
* Atomic operations, implemented using gcc (or compatible) intrinsics.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* Documentation:
* * Legacy __sync Built-in Functions for Atomic Memory Access
* http://gcc.gnu.org/onlinedocs/gcc-4.8.2/gcc/_005f_005fsync-Builtins.html
* * Built-in functions for memory model aware atomic operations
* http://gcc.gnu.org/onlinedocs/gcc-4.8.2/gcc/_005f_005fatomic-Builtins.html
*
* src/include/port/atomics/generic-gcc.h
*
*-------------------------------------------------------------------------
*/
/* intentionally no include guards, should only be included by atomics.h */
#ifndef INSIDE_ATOMICS_H
#error "should be included via atomics.h"
#endif
/*
* icc provides all the same intrinsics but doesn't understand gcc's inline asm
*/
#if defined(__INTEL_COMPILER)
/* NB: Yes, __memory_barrier() is actually just a compiler barrier */
#define pg_compiler_barrier_impl() __memory_barrier()
#else
#define pg_compiler_barrier_impl() __asm__ __volatile__("" ::: "memory")
#endif
/*
* If we're on GCC 4.1.0 or higher, we should be able to get a memory barrier
* out of this compiler built-in. But we prefer to rely on platform specific
* definitions where possible, and use this only as a fallback.
*/
#if !defined(pg_memory_barrier_impl)
# if defined(HAVE_GCC__ATOMIC_INT64_CAS)
# define pg_memory_barrier_impl() __atomic_thread_fence(__ATOMIC_SEQ_CST)
# elif (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
# define pg_memory_barrier_impl() __sync_synchronize()
# endif
#endif /* !defined(pg_memory_barrier_impl) */
#if !defined(pg_read_barrier_impl) && defined(HAVE_GCC__ATOMIC_INT64_CAS)
/* acquire semantics include read barrier semantics */
# define pg_read_barrier_impl() __atomic_thread_fence(__ATOMIC_ACQUIRE)
#endif
#if !defined(pg_write_barrier_impl) && defined(HAVE_GCC__ATOMIC_INT64_CAS)
/* release semantics include write barrier semantics */
# define pg_write_barrier_impl() __atomic_thread_fence(__ATOMIC_RELEASE)
#endif
#ifdef HAVE_ATOMICS
/* generic gcc based atomic flag implementation */
#if !defined(PG_HAVE_ATOMIC_FLAG_SUPPORT) \
&& (defined(HAVE_GCC__SYNC_INT32_TAS) || defined(HAVE_GCC__SYNC_CHAR_TAS))
#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
/* some platforms only have an 8 bit wide TAS */
#ifdef HAVE_GCC__SYNC_CHAR_TAS
volatile char value;
#else
/* but an int works on more platforms */
volatile int value;
#endif
} pg_atomic_flag;
#endif /* !ATOMIC_FLAG_SUPPORT && SYNC_INT32_TAS */
/* generic gcc based atomic uint32 implementation */
#if !defined(PG_HAVE_ATOMIC_U32_SUPPORT) \
&& (defined(HAVE_GCC__ATOMIC_INT32_CAS) || defined(HAVE_GCC__SYNC_INT32_CAS))
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
volatile uint32 value;
} pg_atomic_uint32;
#endif /* defined(HAVE_GCC__ATOMIC_INT32_CAS) || defined(HAVE_GCC__SYNC_INT32_CAS) */
/* generic gcc based atomic uint64 implementation */
#if !defined(PG_HAVE_ATOMIC_U64_SUPPORT) \
&& !defined(PG_DISABLE_64_BIT_ATOMICS) \
&& (defined(HAVE_GCC__ATOMIC_INT64_CAS) || defined(HAVE_GCC__SYNC_INT64_CAS))
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
volatile uint64 value;
} pg_atomic_uint64;
#endif /* defined(HAVE_GCC__ATOMIC_INT64_CAS) || defined(HAVE_GCC__SYNC_INT64_CAS) */
/*
* Implementation follows. Inlined or directly included from atomics.c
*/
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#if !defined(PG_HAVE_ATOMIC_TEST_SET_FLAG) && \
(defined(HAVE_GCC__SYNC_CHAR_TAS) || defined(HAVE_GCC__SYNC_INT32_TAS))
#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
/* NB: only an acquire barrier, not a full one */
/* some platforms only support a 1 here */
return __sync_lock_test_and_set(&ptr->value, 1) == 0;
}
#endif /* !defined(PG_HAVE_ATOMIC_TEST_SET_FLAG) && defined(HAVE_GCC__SYNC_*_TAS) */
#ifndef PG_HAVE_ATOMIC_UNLOCKED_TEST_FLAG
#define PG_HAVE_ATOMIC_UNLOCKED_TEST_FLAG
static inline bool
pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr)
{
return ptr->value == 0;
}
#endif
#ifndef PG_HAVE_ATOMIC_CLEAR_FLAG
#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
/*
* XXX: It would be nicer to use __sync_lock_release here, but gcc insists
* on making that an atomic op which is far too expensive and a stronger
* guarantee than what we actually need.
*/
pg_write_barrier_impl();
ptr->value = 0;
}
#endif
#ifndef PG_HAVE_ATOMIC_INIT_FLAG
#define PG_HAVE_ATOMIC_INIT_FLAG
static inline void
pg_atomic_init_flag_impl(volatile pg_atomic_flag *ptr)
{
pg_atomic_clear_flag_impl(ptr);
}
#endif
/* prefer __atomic, it has a better API */
#if !defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32) && defined(HAVE_GCC__ATOMIC_INT32_CAS)
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
/* FIXME: we can probably use a lower consistency model */
return __atomic_compare_exchange_n(&ptr->value, expected, newval, false,
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
#endif
#if !defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32) && defined(HAVE_GCC__SYNC_INT32_CAS)
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
bool ret;
uint32 current;
current = __sync_val_compare_and_swap(&ptr->value, *expected, newval);
ret = current == *expected;
*expected = current;
return ret;
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_ADD_U32) && defined(HAVE_GCC__SYNC_INT32_CAS)
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
return __sync_fetch_and_add(&ptr->value, add_);
}
#endif
#if !defined(PG_DISABLE_64_BIT_ATOMICS)
#if !defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64) && defined(HAVE_GCC__ATOMIC_INT64_CAS)
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
return __atomic_compare_exchange_n(&ptr->value, expected, newval, false,
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}
#endif
#if !defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64) && defined(HAVE_GCC__SYNC_INT64_CAS)
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
bool ret;
uint64 current;
current = __sync_val_compare_and_swap(&ptr->value, *expected, newval);
ret = current == *expected;
*expected = current;
return ret;
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_ADD_U64) && defined(HAVE_GCC__SYNC_INT64_CAS)
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
return __sync_fetch_and_add(&ptr->value, add_);
}
#endif
#endif /* !defined(PG_DISABLE_64_BIT_ATOMICS) */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */
#endif /* defined(HAVE_ATOMICS) */

View File

@ -0,0 +1,103 @@
/*-------------------------------------------------------------------------
*
* generic-msvc.h
* Atomic operations support when using MSVC
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* NOTES:
*
* Documentation:
* * Interlocked Variable Access
* http://msdn.microsoft.com/en-us/library/ms684122%28VS.85%29.aspx
*
* src/include/port/atomics/generic-msvc.h
*
*-------------------------------------------------------------------------
*/
#include <intrin.h>
#include <windows.h>
/* intentionally no include guards, should only be included by atomics.h */
#ifndef INSIDE_ATOMICS_H
#error "should be included via atomics.h"
#endif
/* Should work on both MSVC and Borland. */
#pragma intrinsic(_ReadWriteBarrier)
#define pg_compiler_barrier_impl() _ReadWriteBarrier()
#ifndef pg_memory_barrier_impl
#define pg_memory_barrier_impl() MemoryBarrier()
#endif
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
volatile uint32 value;
} pg_atomic_uint32;
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
volatile uint64 value;
} pg_atomic_uint64;
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
bool ret;
uint32 current;
current = InterlockedCompareExchange(&ptr->value, newval, *expected);
ret = current == *expected;
*expected = current;
return ret;
}
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
return InterlockedExchangeAdd(&ptr->value, add_);
}
/*
* The non-intrinsics versions are only available in vista upwards, so use the
* intrinsic version. Only supported on >486, but we require XP as a minimum
* baseline, which doesn't support the 486, so we don't need to add checks for
* that case.
*/
#pragma intrinsic(_InterlockedCompareExchange64)
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
bool ret;
uint64 current;
current = _InterlockedCompareExchange64(&ptr->value, newval, *expected);
ret = current == *expected;
*expected = current;
return ret;
}
/* Only implemented on itanium and 64bit builds */
#ifdef _WIN64
#pragma intrinsic(_InterlockedExchangeAdd64)
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
return _InterlockedExchangeAdd64(&ptr->value, add_);
}
#endif /* _WIN64 */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */

View File

@ -0,0 +1,74 @@
/*-------------------------------------------------------------------------
*
* generic-sunpro.h
* Atomic operations for solaris' CC
*
* Portions Copyright (c) 2013-2014, PostgreSQL Global Development Group
*
* NOTES:
*
* Documentation:
* * manpage for atomic_cas(3C)
* http://www.unix.com/man-page/opensolaris/3c/atomic_cas/
* http://docs.oracle.com/cd/E23824_01/html/821-1465/atomic-cas-3c.html
*
* src/include/port/atomics/generic-sunpro.h
*
* -------------------------------------------------------------------------
*/
/* Older versions of the compiler don't have atomic.h... */
#ifdef HAVE_ATOMIC_H
#include <atomic.h>
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
volatile uint32 value;
} pg_atomic_uint32;
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
volatile uint64 value;
} pg_atomic_uint64;
#endif /* HAVE_ATOMIC_H */
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#ifdef HAVE_ATOMIC_H
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
bool ret;
uint32 current;
current = atomic_cas_32(&ptr->value, *expected, newval);
ret = current == *expected;
*expected = current;
return ret;
}
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
bool ret;
uint64 current;
current = atomic_cas_64(&ptr->value, *expected, newval);
ret = current == *expected;
*expected = current;
return ret;
}
#endif /* HAVE_ATOMIC_H */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */

View File

@ -0,0 +1,103 @@
/*-------------------------------------------------------------------------
*
* generic-xlc.h
* Atomic operations for IBM's CC
*
* Portions Copyright (c) 2013-2014, PostgreSQL Global Development Group
*
* NOTES:
*
* Documentation:
* * Synchronization and atomic built-in functions
* http://publib.boulder.ibm.com/infocenter/lnxpcomp/v8v101/topic/com.ibm.xlcpp8l.doc/compiler/ref/bif_sync.htm
*
* src/include/port/atomics/generic-xlc.h
*
* -------------------------------------------------------------------------
*/
#include <atomic.h>
#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
volatile uint32 value;
} pg_atomic_uint32;
/* 64bit atomics are only supported in 64bit mode */
#ifdef __64BIT__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
volatile uint64 value;
} pg_atomic_uint64;
#endif /* __64BIT__ */
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
uint32 *expected, uint32 newval)
{
/*
* xlc's documentation tells us:
* "If __compare_and_swap is used as a locking primitive, insert a call to
* the __isync built-in function at the start of any critical sections."
*/
__isync();
/*
* XXX: __compare_and_swap is defined to take signed parameters, but that
* shouldn't matter since we don't perform any arithmetic operations. It
* copies the value found in ptr->value back into *expected and returns
* true only if the swap was performed, which matches the semantics
* required here.
*/
return __compare_and_swap((volatile int *) &ptr->value,
(int *) expected, (int) newval);
}
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
return __fetch_and_add(&ptr->value, add_);
}
#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
uint64 *expected, uint64 newval)
{
__isync();
/* see pg_atomic_compare_exchange_u32_impl for the __isync() reasoning */
return __compare_and_swaplp((volatile long *) &ptr->value,
(long *) expected, (long) newval);
}
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
return __fetch_and_addlp(&ptr->value, add_);
}
#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */

View File

@ -0,0 +1,387 @@
/*-------------------------------------------------------------------------
*
* generic.h
* Implement higher level operations based on some lower level atomic
* operations.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/port/atomics/generic.h
*
*-------------------------------------------------------------------------
*/
/* intentionally no include guards, should only be included by atomics.h */
#ifndef INSIDE_ATOMICS_H
# error "should be included via atomics.h"
#endif
/*
* If read or write barriers are undefined, we upgrade them to full memory
* barriers.
*/
#if !defined(pg_read_barrier_impl)
# define pg_read_barrier_impl pg_memory_barrier_impl
#endif
#if !defined(pg_write_barrier_impl)
# define pg_write_barrier_impl pg_memory_barrier_impl
#endif
#ifndef PG_HAVE_SPIN_DELAY
#define PG_HAVE_SPIN_DELAY
#define pg_spin_delay_impl() ((void)0)
#endif
/* provide fallback */
#if !defined(PG_HAVE_ATOMIC_FLAG_SUPPORT) && defined(PG_HAVE_ATOMIC_U32_SUPPORT)
#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef pg_atomic_uint32 pg_atomic_flag;
#endif
#if defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS)
#ifndef PG_HAVE_ATOMIC_READ_U32
#define PG_HAVE_ATOMIC_READ_U32
static inline uint32
pg_atomic_read_u32_impl(volatile pg_atomic_uint32 *ptr)
{
return *(&ptr->value);
}
#endif
#ifndef PG_HAVE_ATOMIC_WRITE_U32
#define PG_HAVE_ATOMIC_WRITE_U32
static inline void
pg_atomic_write_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val)
{
ptr->value = val;
}
#endif
/*
* provide fallback for test_and_set using atomic_exchange if available
*/
#if !defined(PG_HAVE_ATOMIC_TEST_SET_FLAG) && defined(PG_HAVE_ATOMIC_EXCHANGE_U32)
#define PG_HAVE_ATOMIC_INIT_FLAG
static inline void
pg_atomic_init_flag_impl(volatile pg_atomic_flag *ptr)
{
pg_atomic_write_u32_impl(ptr, 0);
}
#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
return pg_atomic_exchange_u32_impl(ptr, 1) == 0;
}
#define PG_HAVE_ATOMIC_UNLOCKED_TEST_FLAG
static inline bool
pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr)
{
return pg_atomic_read_u32_impl(ptr) == 0;
}
#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
/* XXX: release semantics suffice? */
pg_memory_barrier_impl();
pg_atomic_write_u32_impl(ptr, 0);
}
/*
* provide fallback for test_and_set using atomic_compare_exchange if
* available.
*/
#elif !defined(PG_HAVE_ATOMIC_TEST_SET_FLAG) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
#define PG_HAVE_ATOMIC_INIT_FLAG
static inline void
pg_atomic_init_flag_impl(volatile pg_atomic_flag *ptr)
{
pg_atomic_write_u32_impl(ptr, 0);
}
#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
uint32 value = 0;
return pg_atomic_compare_exchange_u32_impl(ptr, &value, 1);
}
#define PG_HAVE_ATOMIC_UNLOCKED_TEST_FLAG
static inline bool
pg_atomic_unlocked_test_flag_impl(volatile pg_atomic_flag *ptr)
{
return pg_atomic_read_u32_impl(ptr) == 0;
}
#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
/*
* Use a memory barrier + plain write if we have a native memory
* barrier. But don't do so if memory barriers use spinlocks - that'd lead
* to circularity if flags are used to implement spinlocks.
*/
#ifndef PG_HAVE_MEMORY_BARRIER_EMULATION
/* XXX: release semantics suffice? */
pg_memory_barrier_impl();
pg_atomic_write_u32_impl(ptr, 0);
#else
uint32 value = 1;
pg_atomic_compare_exchange_u32_impl(ptr, &value, 0);
#endif
}
#elif !defined(PG_HAVE_ATOMIC_TEST_SET_FLAG)
# error "No pg_atomic_test_and_set provided"
#endif /* !defined(PG_HAVE_ATOMIC_TEST_SET_FLAG) */
#ifndef PG_HAVE_ATOMIC_INIT_U32
#define PG_HAVE_ATOMIC_INIT_U32
static inline void
pg_atomic_init_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 val_)
{
pg_atomic_write_u32_impl(ptr, val_);
}
#endif
#if !defined(PG_HAVE_ATOMIC_EXCHANGE_U32) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
#define PG_HAVE_ATOMIC_EXCHANGE_U32
static inline uint32
pg_atomic_exchange_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 xchg_)
{
uint32 old;
while (true)
{
old = pg_atomic_read_u32_impl(ptr);
if (pg_atomic_compare_exchange_u32_impl(ptr, &old, xchg_))
break;
}
return old;
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_ADD_U32) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
uint32 old;
while (true)
{
old = pg_atomic_read_u32_impl(ptr);
if (pg_atomic_compare_exchange_u32_impl(ptr, &old, old + add_))
break;
}
return old;
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_SUB_U32) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
#define PG_HAVE_ATOMIC_FETCH_SUB_U32
static inline uint32
pg_atomic_fetch_sub_u32_impl(volatile pg_atomic_uint32 *ptr, int32 sub_)
{
return pg_atomic_fetch_add_u32_impl(ptr, -sub_);
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_AND_U32) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
#define PG_HAVE_ATOMIC_FETCH_AND_U32
static inline uint32
pg_atomic_fetch_and_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 and_)
{
uint32 old;
while (true)
{
old = pg_atomic_read_u32_impl(ptr);
if (pg_atomic_compare_exchange_u32_impl(ptr, &old, old & and_))
break;
}
return old;
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_OR_U32) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
#define PG_HAVE_ATOMIC_FETCH_OR_U32
static inline uint32
pg_atomic_fetch_or_u32_impl(volatile pg_atomic_uint32 *ptr, uint32 or_)
{
uint32 old;
while (true)
{
old = pg_atomic_read_u32_impl(ptr);
if (pg_atomic_compare_exchange_u32_impl(ptr, &old, old | or_))
break;
}
return old;
}
#endif
#if !defined(PG_HAVE_ATOMIC_ADD_FETCH_U32) && defined(PG_HAVE_ATOMIC_FETCH_ADD_U32)
#define PG_HAVE_ATOMIC_ADD_FETCH_U32
static inline uint32
pg_atomic_add_fetch_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
return pg_atomic_fetch_add_u32_impl(ptr, add_) + add_;
}
#endif
#if !defined(PG_HAVE_ATOMIC_SUB_FETCH_U32) && defined(PG_HAVE_ATOMIC_FETCH_SUB_U32)
#define PG_HAVE_ATOMIC_SUB_FETCH_U32
static inline uint32
pg_atomic_sub_fetch_u32_impl(volatile pg_atomic_uint32 *ptr, int32 sub_)
{
return pg_atomic_fetch_sub_u32_impl(ptr, sub_) - sub_;
}
#endif
#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
#if !defined(PG_HAVE_ATOMIC_EXCHANGE_U64) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64)
#define PG_HAVE_ATOMIC_EXCHANGE_U64
static inline uint64
pg_atomic_exchange_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 xchg_)
{
uint64 old;
while (true)
{
old = ptr->value;
if (pg_atomic_compare_exchange_u64_impl(ptr, &old, xchg_))
break;
}
return old;
}
#endif
#ifndef PG_HAVE_ATOMIC_WRITE_U64
#define PG_HAVE_ATOMIC_WRITE_U64
static inline void
pg_atomic_write_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 val)
{
/*
* 64 bit writes aren't safe on all platforms. In the generic
* implementation, implement them as an atomic exchange.
*/
pg_atomic_exchange_u64_impl(ptr, val);
}
#endif
#ifndef PG_HAVE_ATOMIC_READ_U64
#define PG_HAVE_ATOMIC_READ_U64
static inline uint64
pg_atomic_read_u64_impl(volatile pg_atomic_uint64 *ptr)
{
uint64 old = 0;
/*
* 64 bit reads aren't safe on all platforms. In the generic
* implementation, implement them as a compare/exchange with 0. That'll
* fail or succeed, but always return the old value. It is possible that it
* stores a 0, but only if the previous value also was a 0 - i.e. harmless.
*/
pg_atomic_compare_exchange_u64_impl(ptr, &old, 0);
return old;
}
#endif
#ifndef PG_HAVE_ATOMIC_INIT_U64
#define PG_HAVE_ATOMIC_INIT_U64
static inline void
pg_atomic_init_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 val_)
{
pg_atomic_write_u64_impl(ptr, val_);
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_ADD_U64) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64)
#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
uint64 old;
while (true)
{
old = pg_atomic_read_u64_impl(ptr);
if (pg_atomic_compare_exchange_u64_impl(ptr, &old, old + add_))
break;
}
return old;
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_SUB_U64) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64)
#define PG_HAVE_ATOMIC_FETCH_SUB_U64
static inline uint64
pg_atomic_fetch_sub_u64_impl(volatile pg_atomic_uint64 *ptr, int64 sub_)
{
return pg_atomic_fetch_add_u64_impl(ptr, -sub_);
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_AND_U64) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64)
#define PG_HAVE_ATOMIC_FETCH_AND_U64
static inline uint64
pg_atomic_fetch_and_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 and_)
{
uint64 old;
while (true)
{
old = pg_atomic_read_u64_impl(ptr);
if (pg_atomic_compare_exchange_u64_impl(ptr, &old, old & and_))
break;
}
return old;
}
#endif
#if !defined(PG_HAVE_ATOMIC_FETCH_OR_U64) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64)
#define PG_HAVE_ATOMIC_FETCH_OR_U64
static inline uint64
pg_atomic_fetch_or_u64_impl(volatile pg_atomic_uint64 *ptr, uint64 or_)
{
uint64 old;
while (true)
{
old = pg_atomic_read_u64_impl(ptr);
if (pg_atomic_compare_exchange_u64_impl(ptr, &old, old | or_))
break;
}
return old;
}
#endif
#if !defined(PG_HAVE_ATOMIC_ADD_FETCH_U64) && defined(PG_HAVE_ATOMIC_FETCH_ADD_U64)
#define PG_HAVE_ATOMIC_ADD_FETCH_U64
static inline uint64
pg_atomic_add_fetch_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
return pg_atomic_fetch_add_u64_impl(ptr, add_) + add_;
}
#endif
#if !defined(PG_HAVE_ATOMIC_SUB_FETCH_U64) && defined(PG_HAVE_ATOMIC_FETCH_SUB_U64)
#define PG_HAVE_ATOMIC_SUB_FETCH_U64
static inline uint64
pg_atomic_sub_fetch_u64_impl(volatile pg_atomic_uint64 *ptr, int64 sub_)
{
return pg_atomic_fetch_sub_u64_impl(ptr, sub_) - sub_;
}
#endif
#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
#endif /* defined(PG_USE_INLINE) || defined(ATOMICS_INCLUDE_DEFINITIONS) */
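For illustration, the public wrappers layered on top of these _impl functions (the pg_atomic_* names without the _impl suffix, declared in port/atomics.h) are what callers are expected to use. A hypothetical shared counter - the struct and function names below are made up for the example - could look like this:

#include "port/atomics.h"

/* hypothetical counter living in shared memory */
typedef struct CounterShmem
{
    pg_atomic_uint32 nrequests;
} CounterShmem;

static void
counter_init(CounterShmem *cs)
{
    pg_atomic_init_u32(&cs->nrequests, 0);
}

static void
counter_bump(CounterShmem *cs)
{
    /* lock-free increment; no spinlock needed by the caller */
    pg_atomic_fetch_add_u32(&cs->nrequests, 1);
}

static uint32
counter_read(CounterShmem *cs)
{
    return pg_atomic_read_u32(&cs->nrequests);
}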

View File

@ -13,158 +13,11 @@
#ifndef BARRIER_H
#define BARRIER_H
#include "storage/s_lock.h"
extern slock_t dummy_spinlock;
/*
* A compiler barrier need not (and preferably should not) emit any actual
* machine code, but must act as an optimization fence: the compiler must not
* reorder loads or stores to main memory around the barrier. However, the
* CPU may still reorder loads or stores at runtime, if the architecture's
* memory model permits this.
*
* A memory barrier must act as a compiler barrier, and in addition must
* guarantee that all loads and stores issued prior to the barrier are
* completed before any loads or stores issued after the barrier. Unless
* loads and stores are totally ordered (which is not the case on most
* architectures) this requires issuing some sort of memory fencing
* instruction.
*
* A read barrier must act as a compiler barrier, and in addition must
* guarantee that any loads issued prior to the barrier are completed before
* any loads issued after the barrier. Similarly, a write barrier acts
* as a compiler barrier, and also orders stores. Read and write barriers
* are thus weaker than a full memory barrier, but stronger than a compiler
* barrier. In practice, on machines with strong memory ordering, read and
* write barriers may require nothing more than a compiler barrier.
*
* For an introduction to using memory barriers within the PostgreSQL backend,
* see src/backend/storage/lmgr/README.barrier
* This used to be a separate file, full of compiler/architecture
* dependent defines, but it's now just a thin wrapper around the atomics.h
* infrastructure, kept for backward compatibility.
*/
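As a purely illustrative example of the read/write barrier contract described above - the variable and function names here are hypothetical - a producer/consumer handoff through shared memory pairs a write barrier on the producing side with a read barrier on the consuming side:

#include "storage/barrier.h"

/* shared state; 'ready' starts out as 0 */
static volatile int shared_data;
static volatile int ready;

static void
publish(int value)
{
    shared_data = value;
    pg_write_barrier();     /* order the data store before the flag store */
    ready = 1;
}

static bool
consume(int *value)
{
    if (!ready)
        return false;
    pg_read_barrier();      /* order the flag load before the data load */
    *value = shared_data;
    return true;
}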
#if defined(DISABLE_BARRIERS)
/*
* Fall through to the spinlock-based implementation.
*/
#elif defined(__INTEL_COMPILER)
/*
* icc defines __GNUC__, but doesn't support gcc's inline asm syntax
*/
#if defined(__ia64__) || defined(__ia64)
#define pg_memory_barrier() __mf()
#elif defined(__i386__) || defined(__x86_64__)
#define pg_memory_barrier() _mm_mfence()
#endif
#define pg_compiler_barrier() __memory_barrier()
#elif defined(__GNUC__)
/* This works on any architecture, since it's only talking to GCC itself. */
#define pg_compiler_barrier() __asm__ __volatile__("" : : : "memory")
#if defined(__i386__)
/*
* i386 does not allow loads to be reordered with other loads, or stores to be
* reordered with other stores, but a load can be performed before a subsequent
* store.
*
* "lock; addl" has worked for longer than "mfence".
*/
#define pg_memory_barrier() \
__asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#define pg_read_barrier() pg_compiler_barrier()
#define pg_write_barrier() pg_compiler_barrier()
#elif defined(__x86_64__) /* 64 bit x86 */
/*
* x86_64 has similar ordering characteristics to i386.
*
* Technically, some x86-ish chips support uncached memory access and/or
* special instructions that are weakly ordered. In those cases we'd need
* the read and write barriers to be lfence and sfence. But since we don't
* do those things, a compiler barrier should be enough.
*/
#define pg_memory_barrier() \
__asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#define pg_read_barrier() pg_compiler_barrier()
#define pg_write_barrier() pg_compiler_barrier()
#elif defined(__ia64__) || defined(__ia64)
/*
* Itanium is weakly ordered, so read and write barriers require a full
* fence.
*/
#define pg_memory_barrier() __asm__ __volatile__ ("mf" : : : "memory")
#elif defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
/*
* lwsync orders loads with respect to each other, and similarly with stores.
* But a load can be performed before a subsequent store, so sync must be used
* for a full memory barrier.
*/
#define pg_memory_barrier() __asm__ __volatile__ ("sync" : : : "memory")
#define pg_read_barrier() __asm__ __volatile__ ("lwsync" : : : "memory")
#define pg_write_barrier() __asm__ __volatile__ ("lwsync" : : : "memory")
#elif defined(__hppa) || defined(__hppa__) /* HP PA-RISC */
/* HPPA doesn't do either read or write reordering */
#define pg_memory_barrier() pg_compiler_barrier()
#elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)
/*
* If we're on GCC 4.1.0 or higher, we should be able to get a memory
* barrier out of this compiler built-in. But we prefer to rely on our
* own definitions where possible, and use this only as a fallback.
*/
#define pg_memory_barrier() __sync_synchronize()
#endif
#elif defined(__ia64__) || defined(__ia64)
#define pg_compiler_barrier() _Asm_sched_fence()
#define pg_memory_barrier() _Asm_mf()
#elif defined(WIN32_ONLY_COMPILER)
/* Should work on both MSVC and Borland. */
#include <intrin.h>
#pragma intrinsic(_ReadWriteBarrier)
#define pg_compiler_barrier() _ReadWriteBarrier()
#define pg_memory_barrier() MemoryBarrier()
#endif
/*
* If we have no memory barrier implementation for this architecture, we
* fall back to acquiring and releasing a spinlock. This might, in turn,
* fall back to the semaphore-based spinlock implementation, which will be
* amazingly slow.
*
* It's not self-evident that every possible legal implementation of a
* spinlock acquire-and-release would be equivalent to a full memory barrier.
* For example, I'm not sure that Itanium's acq and rel add up to a full
* fence. But all of our actual implementations seem OK in this regard.
*/
#if !defined(pg_memory_barrier)
#define pg_memory_barrier() \
do { S_LOCK(&dummy_spinlock); S_UNLOCK(&dummy_spinlock); } while (0)
#endif
/*
* If read or write barriers are undefined, we upgrade them to full memory
* barriers.
*
* If a compiler barrier is unavailable, you probably don't want a full
* memory barrier instead, so if you have a use case for a compiler barrier,
* you'd better use #ifdef.
*/
#if !defined(pg_read_barrier)
#define pg_read_barrier() pg_memory_barrier()
#endif
#if !defined(pg_write_barrier)
#define pg_write_barrier() pg_memory_barrier()
#endif
#include "port/atomics.h"
#endif /* BARRIER_H */

View File

@ -309,7 +309,7 @@ tas(volatile slock_t *lock)
* than other widths.
*/
#if defined(__arm__) || defined(__arm) || defined(__aarch64__) || defined(__aarch64)
#ifdef HAVE_GCC_INT_ATOMICS
#ifdef HAVE_GCC__SYNC_INT32_TAS
#define HAS_TEST_AND_SET
#define TAS(lock) tas(lock)
@ -324,7 +324,7 @@ tas(volatile slock_t *lock)
#define S_UNLOCK(lock) __sync_lock_release(lock)
#endif /* HAVE_GCC_INT_ATOMICS */
#endif /* HAVE_GCC__SYNC_INT32_TAS */
#endif /* __arm__ || __arm || __aarch64__ || __aarch64 */
@ -889,12 +889,12 @@ typedef int slock_t;
extern bool s_lock_free_sema(volatile slock_t *lock);
extern void s_unlock_sema(volatile slock_t *lock);
extern void s_init_lock_sema(volatile slock_t *lock);
extern void s_init_lock_sema(volatile slock_t *lock, bool nested);
extern int tas_sema(volatile slock_t *lock);
#define S_LOCK_FREE(lock) s_lock_free_sema(lock)
#define S_UNLOCK(lock) s_unlock_sema(lock)
#define S_INIT_LOCK(lock) s_init_lock_sema(lock)
#define S_INIT_LOCK(lock) s_init_lock_sema(lock, false)
#define TAS(lock) tas_sema(lock)
@ -955,6 +955,7 @@ extern int tas(volatile slock_t *lock); /* in port/.../tas.s, or
#define TAS_SPIN(lock) TAS(lock)
#endif /* TAS_SPIN */
extern slock_t dummy_spinlock;
/*
* Platform-independent out-of-line support routines

View File

@ -60,3 +60,11 @@ DROP TABLE lock_tbl2;
DROP TABLE lock_tbl1;
DROP SCHEMA lock_schema1 CASCADE;
DROP ROLE regress_rol_lock1;
-- atomic ops tests
RESET search_path;
SELECT test_atomic_ops();
test_atomic_ops
-----------------
t
(1 row)

View File

@ -57,6 +57,11 @@ CREATE FUNCTION make_tuple_indirect (record)
AS '@libdir@/regress@DLSUFFIX@'
LANGUAGE C STRICT;
CREATE FUNCTION test_atomic_ops()
RETURNS bool
AS '@libdir@/regress@DLSUFFIX@'
LANGUAGE C;
-- Things that shouldn't work:
CREATE FUNCTION test1 (int) RETURNS int LANGUAGE SQL

View File

@ -51,6 +51,10 @@ CREATE FUNCTION make_tuple_indirect (record)
RETURNS record
AS '@libdir@/regress@DLSUFFIX@'
LANGUAGE C STRICT;
CREATE FUNCTION test_atomic_ops()
RETURNS bool
AS '@libdir@/regress@DLSUFFIX@'
LANGUAGE C;
-- Things that shouldn't work:
CREATE FUNCTION test1 (int) RETURNS int LANGUAGE SQL
AS 'SELECT ''not an integer'';';

View File

@ -18,6 +18,7 @@
#include "executor/executor.h"
#include "executor/spi.h"
#include "miscadmin.h"
#include "port/atomics.h"
#include "utils/builtins.h"
#include "utils/geo_decls.h"
#include "utils/rel.h"
@ -865,3 +866,241 @@ wait_pid(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
#ifndef PG_HAVE_ATOMIC_FLAG_SIMULATION
static void
test_atomic_flag(void)
{
pg_atomic_flag flag;
pg_atomic_init_flag(&flag);
if (!pg_atomic_unlocked_test_flag(&flag))
elog(ERROR, "flag: unexpectedly set");
if (!pg_atomic_test_set_flag(&flag))
elog(ERROR, "flag: couldn't set");
if (pg_atomic_unlocked_test_flag(&flag))
elog(ERROR, "flag: unexpectedly unset");
if (pg_atomic_test_set_flag(&flag))
elog(ERROR, "flag: set spuriously #2");
pg_atomic_clear_flag(&flag);
if (!pg_atomic_unlocked_test_flag(&flag))
elog(ERROR, "flag: unexpectedly set #2");
if (!pg_atomic_test_set_flag(&flag))
elog(ERROR, "flag: couldn't set");
pg_atomic_clear_flag(&flag);
}
#endif /* PG_HAVE_ATOMIC_FLAG_SIMULATION */
static void
test_atomic_uint32(void)
{
pg_atomic_uint32 var;
uint32 expected;
int i;
pg_atomic_init_u32(&var, 0);
if (pg_atomic_read_u32(&var) != 0)
elog(ERROR, "atomic_read_u32() #1 wrong");
pg_atomic_write_u32(&var, 3);
if (pg_atomic_read_u32(&var) != 3)
elog(ERROR, "atomic_read_u32() #2 wrong");
if (pg_atomic_fetch_add_u32(&var, 1) != 3)
elog(ERROR, "atomic_fetch_add_u32() #1 wrong");
if (pg_atomic_fetch_sub_u32(&var, 1) != 4)
elog(ERROR, "atomic_fetch_sub_u32() #1 wrong");
if (pg_atomic_sub_fetch_u32(&var, 3) != 0)
elog(ERROR, "atomic_sub_fetch_u32() #1 wrong");
if (pg_atomic_add_fetch_u32(&var, 10) != 10)
elog(ERROR, "atomic_add_fetch_u32() #1 wrong");
if (pg_atomic_exchange_u32(&var, 5) != 10)
elog(ERROR, "pg_atomic_exchange_u32() #1 wrong");
if (pg_atomic_exchange_u32(&var, 0) != 5)
elog(ERROR, "pg_atomic_exchange_u32() #0 wrong");
/* test around numerical limits */
if (pg_atomic_fetch_add_u32(&var, INT_MAX) != 0)
elog(ERROR, "pg_atomic_fetch_add_u32() #2 wrong");
if (pg_atomic_fetch_add_u32(&var, INT_MAX) != INT_MAX)
elog(ERROR, "pg_atomic_add_fetch_u32() #3 wrong");
pg_atomic_fetch_add_u32(&var, 1); /* top up to UINT_MAX */
if (pg_atomic_read_u32(&var) != UINT_MAX)
elog(ERROR, "atomic_read_u32() #2 wrong");
if (pg_atomic_fetch_sub_u32(&var, INT_MAX) != UINT_MAX)
elog(ERROR, "pg_atomic_fetch_sub_u32() #2 wrong");
if (pg_atomic_read_u32(&var) != (uint32)INT_MAX + 1)
elog(ERROR, "atomic_read_u32() #3 wrong: %u", pg_atomic_read_u32(&var));
expected = pg_atomic_sub_fetch_u32(&var, INT_MAX);
if (expected != 1)
elog(ERROR, "pg_atomic_sub_fetch_u32() #3 wrong: %u", expected);
pg_atomic_sub_fetch_u32(&var, 1);
/* fail exchange because of old expected */
expected = 10;
if (pg_atomic_compare_exchange_u32(&var, &expected, 1))
elog(ERROR, "atomic_compare_exchange_u32() changed value spuriously");
/* CAS is allowed to fail due to interrupts, try a couple of times */
for (i = 0; i < 1000; i++)
{
expected = 0;
if (!pg_atomic_compare_exchange_u32(&var, &expected, 1))
break;
}
if (i == 1000)
elog(ERROR, "atomic_compare_exchange_u32() never succeeded");
if (pg_atomic_read_u32(&var) != 1)
elog(ERROR, "atomic_compare_exchange_u32() didn't set value properly");
pg_atomic_write_u32(&var, 0);
/* try setting flagbits */
if (pg_atomic_fetch_or_u32(&var, 1) & 1)
elog(ERROR, "pg_atomic_fetch_or_u32() #1 wrong");
if (!(pg_atomic_fetch_or_u32(&var, 2) & 1))
elog(ERROR, "pg_atomic_fetch_or_u32() #2 wrong");
if (pg_atomic_read_u32(&var) != 3)
elog(ERROR, "invalid result after pg_atomic_fetch_or_u32()");
/* try clearing flagbits */
if ((pg_atomic_fetch_and_u32(&var, ~2) & 3) != 3)
elog(ERROR, "pg_atomic_fetch_and_u32() #1 wrong");
if (pg_atomic_fetch_and_u32(&var, ~1) != 1)
elog(ERROR, "pg_atomic_fetch_and_u32() #2 wrong: is %u",
pg_atomic_read_u32(&var));
/* no bits set anymore */
if (pg_atomic_fetch_and_u32(&var, ~0) != 0)
elog(ERROR, "pg_atomic_fetch_and_u32() #3 wrong");
}
#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
static void
test_atomic_uint64(void)
{
pg_atomic_uint64 var;
uint64 expected;
int i;
pg_atomic_init_u64(&var, 0);
if (pg_atomic_read_u64(&var) != 0)
elog(ERROR, "atomic_read_u64() #1 wrong");
pg_atomic_write_u64(&var, 3);
if (pg_atomic_read_u64(&var) != 3)
elog(ERROR, "atomic_read_u64() #2 wrong");
if (pg_atomic_fetch_add_u64(&var, 1) != 3)
elog(ERROR, "atomic_fetch_add_u64() #1 wrong");
if (pg_atomic_fetch_sub_u64(&var, 1) != 4)
elog(ERROR, "atomic_fetch_sub_u64() #1 wrong");
if (pg_atomic_sub_fetch_u64(&var, 3) != 0)
elog(ERROR, "atomic_sub_fetch_u64() #1 wrong");
if (pg_atomic_add_fetch_u64(&var, 10) != 10)
elog(ERROR, "atomic_add_fetch_u64() #1 wrong");
if (pg_atomic_exchange_u64(&var, 5) != 10)
elog(ERROR, "pg_atomic_exchange_u64() #1 wrong");
if (pg_atomic_exchange_u64(&var, 0) != 5)
elog(ERROR, "pg_atomic_exchange_u64() #0 wrong");
/* fail exchange because of old expected */
expected = 10;
if (pg_atomic_compare_exchange_u64(&var, &expected, 1))
elog(ERROR, "atomic_compare_exchange_u64() changed value spuriously");
/* CAS is allowed to fail due to interrupts, try a couple of times */
for (i = 0; i < 100; i++)
{
expected = 0;
if (!pg_atomic_compare_exchange_u64(&var, &expected, 1))
break;
}
if (i == 100)
elog(ERROR, "atomic_compare_exchange_u64() never succeeded");
if (pg_atomic_read_u64(&var) != 1)
elog(ERROR, "atomic_compare_exchange_u64() didn't set value properly");
pg_atomic_write_u64(&var, 0);
/* try setting flagbits */
if (pg_atomic_fetch_or_u64(&var, 1) & 1)
elog(ERROR, "pg_atomic_fetch_or_u64() #1 wrong");
if (!(pg_atomic_fetch_or_u64(&var, 2) & 1))
elog(ERROR, "pg_atomic_fetch_or_u64() #2 wrong");
if (pg_atomic_read_u64(&var) != 3)
elog(ERROR, "invalid result after pg_atomic_fetch_or_u64()");
/* try clearing flagbits */
if ((pg_atomic_fetch_and_u64(&var, ~2) & 3) != 3)
elog(ERROR, "pg_atomic_fetch_and_u64() #1 wrong");
if (pg_atomic_fetch_and_u64(&var, ~1) != 1)
elog(ERROR, "pg_atomic_fetch_and_u64() #2 wrong: is "UINT64_FORMAT,
pg_atomic_read_u64(&var));
/* no bits set anymore */
if (pg_atomic_fetch_and_u64(&var, ~0) != 0)
elog(ERROR, "pg_atomic_fetch_and_u64() #3 wrong");
}
#endif /* PG_HAVE_ATOMIC_U64_SUPPORT */
PG_FUNCTION_INFO_V1(test_atomic_ops);
Datum
test_atomic_ops(PG_FUNCTION_ARGS)
{
/* ---
* Can't run the test under the semaphore emulation, it doesn't handle
* checking two edge cases well:
* - pg_atomic_unlocked_test_flag() always returns true
* - locking an already locked flag blocks
* It seems better not to test the semaphore fallback here than to weaken
* the checks for the other cases. The semaphore code will be the same
* everywhere, whereas the efficient implementations won't.
* ---
*/
#ifndef PG_HAVE_ATOMIC_FLAG_SIMULATION
test_atomic_flag();
#endif
test_atomic_uint32();
#ifdef PG_HAVE_ATOMIC_U64_SUPPORT
test_atomic_uint64();
#endif
PG_RETURN_BOOL(true);
}

View File

@ -64,3 +64,8 @@ DROP TABLE lock_tbl2;
DROP TABLE lock_tbl1;
DROP SCHEMA lock_schema1 CASCADE;
DROP ROLE regress_rol_lock1;
-- atomic ops tests
RESET search_path;
SELECT test_atomic_ops();