diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 3e13394..00a1807
*** a/src/backend/storage/lmgr/lwlock.c
--- b/src/backend/storage/lmgr/lwlock.c
*************** GetLWLockIdentifier(uint32 classId, uint
*** 728,791 ****
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		old_state;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
  	/*
! 	 * Read once outside the loop, later iterations will get the newer value
! 	 * via compare & exchange.
  	 */
! 	old_state = pg_atomic_read_u32(&lock->state);
  
! 	/* loop until we've determined whether we could acquire the lock or not */
! 	while (true)
  	{
! 		uint32		desired_state;
! 		bool		lock_free;
! 
! 		desired_state = old_state;
! 
! 		if (mode == LW_EXCLUSIVE)
! 		{
! 			lock_free = (old_state & LW_LOCK_MASK) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_EXCLUSIVE;
! 		}
! 		else
! 		{
! 			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_SHARED;
! 		}
! 
! 		/*
! 		 * Attempt to swap in the state we are expecting. If we didn't see
! 		 * lock to be free, that's just the old value. If we saw it as free,
! 		 * we'll attempt to mark it acquired. The reason that we always swap
! 		 * in the value is that this doubles as a memory barrier. We could try
! 		 * to be smarter and only swap in values if we saw the lock as free,
! 		 * but benchmark haven't shown it as beneficial so far.
! 		 *
! 		 * Retry if the value changed since we last looked at it.
! 		 */
! 		if (pg_atomic_compare_exchange_u32(&lock->state,
! 										   &old_state, desired_state))
! 		{
! 			if (lock_free)
! 			{
! 				/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 				if (mode == LW_EXCLUSIVE)
! 					lock->owner = MyProc;
  #endif
! 				return false;
! 			}
! 			else
! 				return true;	/* somebody else has the lock */
! 		}
  	}
- 	pg_unreachable();
  }
  
  /*
--- 728,773 ----
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		old_state,
! 				mask,
! 				increment;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
+ 	if (mode == LW_EXCLUSIVE)
+ 	{
+ 		mask = LW_LOCK_MASK;
+ 		increment = LW_VAL_EXCLUSIVE;
+ 	}
+ 	else
+ 	{
+ 		mask = LW_VAL_EXCLUSIVE;
+ 		increment = LW_VAL_SHARED;
+ 	}
+ 
  	/*
! 	 * Use the 'check mask then add' atomic, which does all the useful work
! 	 * for us in one operation.
  	 */
! 	old_state = pg_atomic_fetch_mask_add_u32(&lock->state, mask, increment);
  
! 	/*
! 	 * If the masked bits were clear in the old state, the atomic operation
! 	 * has already applied the increment, i.e. we have acquired the lock.
! 	 */
! 	if ((old_state & mask) == 0)
  	{
! 		/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 		if (mode == LW_EXCLUSIVE)
! 			lock->owner = MyProc;
  #endif
! 		return false;
! 	}
! 	else
! 	{
! 		return true;	/* somebody else has the lock */
  	}
  }
  
  /*
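
For reference, a minimal standalone sketch of the acquire logic above: it uses
C11 atomics to stand in for pg_atomic_fetch_mask_add_u32(), and the state-layout
constants are illustrative placeholders rather than PostgreSQL's actual
LW_VAL_SHARED / LW_VAL_EXCLUSIVE / LW_LOCK_MASK definitions.  Note that, unlike
LWLockAttemptLock() (which returns true when the caller must wait), this helper
returns true on success.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative state layout: low bits count shared holders, one exclusive bit. */
#define VAL_SHARED		1u
#define VAL_EXCLUSIVE	(1u << 24)
#define LOCK_MASK		((1u << 25) - 1)

/* CAS-loop stand-in for pg_atomic_fetch_mask_add_u32(). */
static uint32_t
fetch_mask_add_u32(_Atomic uint32_t *ptr, uint32_t mask, uint32_t add)
{
	uint32_t	old = atomic_load(ptr);

	while (true)
	{
		uint32_t	desired = old;

		if ((old & mask) == 0)
			desired += add;
		if (atomic_compare_exchange_weak(ptr, &old, desired))
			return old;
	}
}

/* Returns true if the shared or exclusive lock was acquired. */
static bool
attempt_lock(_Atomic uint32_t *state, bool exclusive)
{
	uint32_t	mask = exclusive ? LOCK_MASK : VAL_EXCLUSIVE;
	uint32_t	increment = exclusive ? VAL_EXCLUSIVE : VAL_SHARED;

	return (fetch_mask_add_u32(state, mask, increment) & mask) == 0;
}

int
main(void)
{
	_Atomic uint32_t state = 0;

	printf("shared #1:  %d\n", attempt_lock(&state, false));	/* 1: acquired */
	printf("shared #2:  %d\n", attempt_lock(&state, false));	/* 1: acquired */
	printf("exclusive:  %d\n", attempt_lock(&state, true));	/* 0: must wait */
	return 0;
}
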
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
index 2e2ec27..74c2a41
*** a/src/include/port/atomics.h
--- b/src/include/port/atomics.h
*************** pg_atomic_sub_fetch_u32(volatile pg_atom
*** 415,420 ****
--- 415,437 ----
  	return pg_atomic_sub_fetch_u32_impl(ptr, sub_);
  }
  
+ /*
+  * pg_atomic_fetch_mask_add_u32 - atomically check the masked bits of the
+  * variable and, if they are all clear, add to the variable.
+  *
+  * Returns the value of ptr before the atomic operation.
+  *
+  * Full barrier semantics.
+  */
+ static inline uint32
+ pg_atomic_fetch_mask_add_u32(volatile pg_atomic_uint32 *ptr,
+ 							 uint32 mask_, uint32 add_)
+ {
+ 	AssertPointerAlignment(ptr, 4);
+ 	return pg_atomic_fetch_mask_add_u32_impl(ptr, mask_, add_);
+ }
+ 
+ 
  /* ----
   * The 64 bit operations have the same semantics as their 32bit counterparts
   * if they are available. Check the corresponding 32bit function for
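
As a usage illustration of the contract documented above (the function returns
the value before the operation, and the add is applied only when all masked
bits are clear), here is a hedged sketch of a counter that can be "frozen":
the COUNTER_FROZEN bit, the counter variable, and both helper functions are
invented for this example; pg_atomic_init_u32() and pg_atomic_fetch_or_u32()
are the existing atomics API.

#include "postgres.h"
#include "port/atomics.h"

#define COUNTER_FROZEN	((uint32) 1 << 31)	/* hypothetical flag bit */

static pg_atomic_uint32 counter;	/* pg_atomic_init_u32(&counter, 0) at startup */

/* Bump the counter unless it has been frozen; report whether we did. */
static bool
counter_bump(void)
{
	uint32		old = pg_atomic_fetch_mask_add_u32(&counter, COUNTER_FROZEN, 1);

	return (old & COUNTER_FROZEN) == 0;
}

/* Freeze the counter so that further bumps become no-ops. */
static void
counter_freeze(void)
{
	pg_atomic_fetch_or_u32(&counter, COUNTER_FROZEN);
}
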
diff --git a/src/include/port/atomics/arch-ppc.h b/src/include/port/atomics/arch-ppc.h
index ed1cd9d..cce2b55
*** a/src/include/port/atomics/arch-ppc.h
--- b/src/include/port/atomics/arch-ppc.h
***************
*** 23,26 ****
--- 23,83 ----
  #define pg_memory_barrier_impl()	__asm__ __volatile__ ("sync" : : : "memory")
  #define pg_read_barrier_impl()		__asm__ __volatile__ ("lwsync" : : : "memory")
  #define pg_write_barrier_impl()		__asm__ __volatile__ ("lwsync" : : : "memory")
+ 
+ #if defined(HAVE_ATOMICS) \
+ 	&& (defined(HAVE_GCC__ATOMIC_INT32_CAS) || defined(HAVE_GCC__SYNC_INT32_CAS))
+ 
+ /*
+  * Declare the pg_atomic_uint32 structure before generic-gcc.h does, so that
+  * it can be used in the function arguments below.
+  */
+ #define PG_HAVE_ATOMIC_U32_SUPPORT
+ typedef struct pg_atomic_uint32
+ {
+ 	volatile uint32 value;
+ } pg_atomic_uint32;
+ 
+ /*
+  * Optimized implementation of pg_atomic_fetch_mask_add_u32() for Power
+  * processors.  Atomic operations on Power are built on optimistic locking:
+  * 'lwarx' places a reservation on the target word, and that reservation can
+  * be lost before the matching 'stwcx.' succeeds, forcing a retry.  Thus each
+  * CAS is itself a loop, and a loop built on top of CAS becomes a two-level
+  * nested loop.  Experiments on multicore Power machines show a substantial
+  * benefit from doing the whole operation in a single loop in assembly.
+  */
+ #define PG_HAVE_ATOMIC_FETCH_MASK_ADD_U32
+ static inline uint32
+ pg_atomic_fetch_mask_add_u32_impl(volatile pg_atomic_uint32 *ptr,
+ 								  uint32 mask, uint32 increment)
+ {
+ 	uint32		result,
+ 				tmp;
+ 
+ 	__asm__ __volatile__(
+ 	/* read *ptr and place a reservation */
+ #ifdef USE_PPC_LWARX_MUTEX_HINT
+ 	"	lwarx	%0,0,%5,1	\n"
+ #else
+ 	"	lwarx	%0,0,%5		\n"
+ #endif
+ 	"	and		%1,%0,%3	\n" /* calculate '*ptr & mask' */
+ 	"	cmpwi	%1,0		\n" /* compare '*ptr & mask' with 0 */
+ 	"	bne-	$+16		\n" /* exit if '*ptr & mask' != 0 */
+ 	"	add		%1,%0,%4	\n" /* calculate '*ptr + increment' */
+ 	"	stwcx.	%1,0,%5		\n" /* try to store '*ptr + increment' into *ptr */
+ 	"	bne-	$-24		\n" /* retry if the reservation was lost */
+ #ifdef USE_PPC_LWSYNC
+ 	"	lwsync				\n"
+ #else
+ 	"	isync				\n"
+ #endif
+ 	: "=&r"(result), "=&r"(tmp), "+m"(*ptr)
+ 	: "r"(mask), "r"(increment), "r"(ptr)
+ 	: "memory", "cc");
+ 	return result;
+ }
+ 
+ #endif
+ 
  #endif
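
To see why the single hand-written loop above helps, compare it with the shape
the generic fallback (next file) takes on Power: the outer retry loop wraps a
compare-and-exchange, and on Power that compare-and-exchange itself expands to
an inner lwarx/stwcx. loop, giving the two-level nesting mentioned in the
comment.  A minimal sketch using GCC builtins, not PostgreSQL code:

#include <stdbool.h>
#include <stdint.h>

static uint32_t
fetch_mask_add_nested(volatile uint32_t *ptr, uint32_t mask, uint32_t add)
{
	uint32_t	old = __atomic_load_n(ptr, __ATOMIC_RELAXED);

	while (true)				/* outer loop: reread and retry */
	{
		uint32_t	desired = old;

		if ((old & mask) == 0)
			desired += add;

		/* on Power this builtin compiles to an inner lwarx/stwcx. loop */
		if (__atomic_compare_exchange_n(ptr, &old, desired, false,
										__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
			return old;
	}
}
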
diff --git a/src/include/port/atomics/generic.h b/src/include/port/atomics/generic.h
index a5b29d8..ac934ce
*** a/src/include/port/atomics/generic.h
--- b/src/include/port/atomics/generic.h
*************** pg_atomic_sub_fetch_u64_impl(volatile pg
*** 390,392 ****
--- 390,439 ----
  #endif
  
  #endif /* PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64 */
+ 
+ #if !defined(PG_HAVE_ATOMIC_FETCH_MASK_ADD_U32) && defined(PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32)
+ #define PG_HAVE_ATOMIC_FETCH_MASK_ADD_U32
+ /*
+  * Generic implementation of pg_atomic_fetch_mask_add_u32() via loop
+  * of compare & exchange.
+  */
+ static inline uint32
+ pg_atomic_fetch_mask_add_u32_impl(volatile pg_atomic_uint32 *ptr,
+ 								  uint32 mask_, uint32 add_)
+ {
+ 	uint32		old_value;
+ 
+ 	/*
+ 	 * Read once outside the loop, later iterations will get the newer value
+ 	 * via compare & exchange.
+ 	 */
+ 	old_value = pg_atomic_read_u32_impl(ptr);
+ 
+ 	/* loop until we've determined whether we could do the increment or not */
+ 	while (true)
+ 	{
+ 		uint32		desired_value;
+ 		bool		free;
+ 
+ 		desired_value = old_value;
+ 		free = (old_value & mask_) == 0;
+ 		if (free)
+ 			desired_value += add_;
+ 
+ 		/*
+ 		 * Attempt to swap in the value we are expecting. If we didn't see the
+ 		 * masked bits as clear, that's just the old value. If we saw them as
+ 		 * clear, we'll attempt to apply the increment. The reason that we
+ 		 * always swap in the value is that this doubles as a memory barrier.
+ 		 * We could try to be smarter and only swap in values if we saw the
+ 		 * masked bits as clear, but benchmarks haven't shown that to be
+ 		 * beneficial so far.
+ 		 *
+ 		 * Retry if the value changed since we last looked at it.
+ 		 */
+ 		if (pg_atomic_compare_exchange_u32_impl(ptr, &old_value, desired_value))
+ 			return old_value;
+ 	}
+ 	pg_unreachable();
+ }
+ #endif
