diff --git a/src/backend/port/atomics.c b/src/backend/port/atomics.c
new file mode 100644
index 86b5308..55a9910
*** a/src/backend/port/atomics.c
--- b/src/backend/port/atomics.c
*************** pg_atomic_fetch_add_u32_impl(volatile pg
*** 158,160 ****
--- 158,243 ----
  }
  
  #endif   /* PG_HAVE_ATOMIC_U32_SIMULATION */
+ 
+ #if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && (defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__))
+ 
+ /*
+  * Optimized implementation for Power processors.  Atomic operations on Power
+  * are built from load-reserve/store-conditional pairs: 'lwarx' places a
+  * reservation on the target word, and the matching 'stwcx.' fails if that
+  * reservation has been broken in the meantime, forcing a retry.  Each CAS is
+  * therefore itself a loop, so a CAS-based retry loop becomes a two-level
+  * nested loop.  Experiments on multicore Power machines show a substantial
+  * benefit from doing the whole operation in a single assembly loop.
+  */
+ uint32
+ pg_atomic_fetch_mask_add_u32(volatile pg_atomic_uint32 *ptr,
+ 							 uint32 mask, uint32 increment)
+ {
+ 	uint32		result;
+ 
+ 	__asm__ __volatile__(
+ 	"	lwarx	%0,0,%4		\n" /* load *ptr and place a reservation */
+ 	"	and		3,%0,%2		\n" /* calculate '*ptr & mask' */
+ 	"	cmpwi	3,0			\n" /* compare '*ptr & mask' with 0 */
+ 	"	bne-	$+16		\n" /* skip to the barrier if '*ptr & mask' != 0 */
+ 	"	add		3,%0,%3		\n" /* calculate '*ptr + increment' */
+ 	"	stwcx.	3,0,%4		\n" /* try to store '*ptr + increment' into *ptr */
+ 	"	bne-	$-24		\n" /* retry if the reservation was broken */
+ #ifdef USE_PPC_LWSYNC
+ 	"	lwsync				\n"
+ #else
+ 	"	isync				\n"
+ #endif
+ 	: "=&r"(result), "+m"(*ptr)
+ 	: "r"(mask), "r"(increment), "r"(ptr)
+ 	: "memory", "cc", "r3");
+ 	return result;
+ }
+ 
+ #else
+ 
+ /*
+  * Generic implementation via loop of compare & exchange.
+  */
+ uint32
+ pg_atomic_fetch_mask_add_u32(volatile pg_atomic_uint32 *ptr,
+ 							 uint32 mask_, uint32 add_)
+ {
+ 	uint32		old_value;
+ 
+ 	/*
+ 	 * Read once outside the loop; later iterations will get the newer value
+ 	 * via compare & exchange.
+ 	 */
+ 	old_value = pg_atomic_read_u32(ptr);
+ 
+ 	/* loop until we've determined whether we can apply the increment or not */
+ 	while (true)
+ 	{
+ 		uint32		desired_value;
+ 		bool		free;
+ 
+ 		desired_value = old_value;
+ 		free = (old_value & mask_) == 0;
+ 		if (free)
+ 			desired_value += add_;
+ 
+ 		/*
+ 		 * Attempt to swap in the value we are expecting.  If we didn't see the
+ 		 * masked bits as clear, that's just the old value.  If we saw them as
+ 		 * clear, we attempt to apply the increment.  The reason that we always
+ 		 * swap in the value is that this doubles as a memory barrier.  We
+ 		 * could try to be smarter and only swap in values when we saw the
+ 		 * masked bits as clear, but benchmarks haven't shown that to be
+ 		 * beneficial so far.
+ 		 *
+ 		 * Retry if the value changed since we last looked at it.
+ 		 */
+ 		if (pg_atomic_compare_exchange_u32(ptr, &old_value, desired_value))
+ 			return old_value;
+ 	}
+ 	pg_unreachable();
+ }
+ 
+ #endif
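
For readers who want to try the new primitive outside PostgreSQL, here is a minimal standalone sketch of the same 'check mask, then add' contract, written against C11 atomics instead of the pg_atomic API. It is not part of the patch, and every name in it is made up for illustration.

/*
 * Minimal demo of the fetch-mask-add contract, using C11 atomics.
 * Build with: cc -std=c11 -o demo demo.c
 */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
fetch_mask_add_u32(_Atomic uint32_t *ptr, uint32_t mask, uint32_t add)
{
	uint32_t	old = atomic_load(ptr);

	while (true)
	{
		uint32_t	desired = old;

		/* apply the increment only if all masked bits are clear */
		if ((old & mask) == 0)
			desired += add;

		/* on failure the CAS reloads 'old' with the current value */
		if (atomic_compare_exchange_weak(ptr, &old, desired))
			return old;
	}
}

int
main(void)
{
	_Atomic uint32_t v = 0;

	/* bit 31 is clear, so the add is applied and the old value 0 is returned */
	printf("%" PRIu32 "\n", fetch_mask_add_u32(&v, UINT32_C(1) << 31, 1));
	/* set bit 31: the next call refuses to add and just reports the value */
	atomic_fetch_or(&v, UINT32_C(1) << 31);
	printf("%" PRIu32 "\n", fetch_mask_add_u32(&v, UINT32_C(1) << 31, 1));
	return 0;
}
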
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index ab81d94..766e3de
*** a/src/backend/storage/lmgr/lwlock.c
--- b/src/backend/storage/lmgr/lwlock.c
*************** GetLWLockIdentifier(uint32 classId, uint
*** 727,790 ****
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		old_state;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
  	/*
! 	 * Read once outside the loop, later iterations will get the newer value
! 	 * via compare & exchange.
  	 */
! 	old_state = pg_atomic_read_u32(&lock->state);
  
! 	/* loop until we've determined whether we could acquire the lock or not */
! 	while (true)
  	{
! 		uint32		desired_state;
! 		bool		lock_free;
! 
! 		desired_state = old_state;
! 
! 		if (mode == LW_EXCLUSIVE)
! 		{
! 			lock_free = (old_state & LW_LOCK_MASK) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_EXCLUSIVE;
! 		}
! 		else
! 		{
! 			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
! 			if (lock_free)
! 				desired_state += LW_VAL_SHARED;
! 		}
! 
! 		/*
! 		 * Attempt to swap in the state we are expecting. If we didn't see
! 		 * lock to be free, that's just the old value. If we saw it as free,
! 		 * we'll attempt to mark it acquired. The reason that we always swap
! 		 * in the value is that this doubles as a memory barrier. We could try
! 		 * to be smarter and only swap in values if we saw the lock as free,
! 		 * but benchmark haven't shown it as beneficial so far.
! 		 *
! 		 * Retry if the value changed since we last looked at it.
! 		 */
! 		if (pg_atomic_compare_exchange_u32(&lock->state,
! 										   &old_state, desired_state))
! 		{
! 			if (lock_free)
! 			{
! 				/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 				if (mode == LW_EXCLUSIVE)
! 					lock->owner = MyProc;
  #endif
! 				return false;
! 			}
! 			else
! 				return true;	/* somebody else has the lock */
! 		}
  	}
- 	pg_unreachable();
  }
  
  /*
--- 727,772 ----
  static bool
  LWLockAttemptLock(LWLock *lock, LWLockMode mode)
  {
! 	uint32		old_state,
! 				mask,
! 				increment;
  
  	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
  
+ 	if (mode == LW_EXCLUSIVE)
+ 	{
+ 		mask = LW_LOCK_MASK;
+ 		increment = LW_VAL_EXCLUSIVE;
+ 	}
+ 	else
+ 	{
+ 		mask = LW_VAL_EXCLUSIVE;
+ 		increment = LW_VAL_SHARED;
+ 	}
+ 
  	/*
! 	 * Use the 'check mask then add' atomic, which does all of the useful
! 	 * work for us in a single operation.
  	 */
! 	old_state = pg_atomic_fetch_mask_add_u32(&lock->state, mask, increment);
  
! 	/*
! 	 * If the masked bits were clear in the old state, the increment was
! 	 * applied and we have acquired the lock.
! 	 */
! 	if ((old_state & mask) == 0)
  	{
! 		/* Great! Got the lock. */
  #ifdef LOCK_DEBUG
! 		if (mode == LW_EXCLUSIVE)
! 			lock->owner = MyProc;
  #endif
! 		return false;
! 	}
! 	else
! 	{
! 		return true;	/* somebody else has the lock */
  	}
  }
  
  /*
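
To mirror the simplified LWLockAttemptLock above, here is a standalone toy showing how the caller interprets the returned value. It assumes the LW_* bit layout from lwlock.c at the time of this patch (shared holders counted in the low 24 bits, the exclusive bit at bit 24); the function and variable names below are invented for illustration and none of this is PostgreSQL code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* bit layout as in lwlock.c at the time of this patch */
#define LW_VAL_EXCLUSIVE	((uint32_t) 1 << 24)
#define LW_VAL_SHARED		1
#define LW_LOCK_MASK		((uint32_t) ((1 << 25) - 1))

typedef enum { LW_EXCLUSIVE, LW_SHARED } LWLockMode;

/* same generic CAS loop as in the sketch after atomics.c above */
static uint32_t
fetch_mask_add_u32(_Atomic uint32_t *ptr, uint32_t mask, uint32_t add)
{
	uint32_t	old = atomic_load(ptr);

	while (true)
	{
		uint32_t	desired = old;

		if ((old & mask) == 0)
			desired += add;
		if (atomic_compare_exchange_weak(ptr, &old, desired))
			return old;
	}
}

/* returns true if the lock could NOT be acquired, like LWLockAttemptLock */
static bool
toy_attempt_lock(_Atomic uint32_t *state, LWLockMode mode)
{
	uint32_t	mask = (mode == LW_EXCLUSIVE) ? LW_LOCK_MASK : LW_VAL_EXCLUSIVE;
	uint32_t	increment = (mode == LW_EXCLUSIVE) ? LW_VAL_EXCLUSIVE : LW_VAL_SHARED;
	uint32_t	old_state = fetch_mask_add_u32(state, mask, increment);

	return (old_state & mask) != 0;
}

int
main(void)
{
	_Atomic uint32_t state = 0;

	printf("shared #1 blocked?  %d\n", toy_attempt_lock(&state, LW_SHARED));	/* 0 */
	printf("shared #2 blocked?  %d\n", toy_attempt_lock(&state, LW_SHARED));	/* 0 */
	printf("exclusive blocked?  %d\n", toy_attempt_lock(&state, LW_EXCLUSIVE));	/* 1 */
	return 0;
}
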
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
new file mode 100644
index 2e2ec27..4ec0219
*** a/src/include/port/atomics.h
--- b/src/include/port/atomics.h
*************** pg_atomic_sub_fetch_u32(volatile pg_atom
*** 415,420 ****
--- 415,433 ----
  	return pg_atomic_sub_fetch_u32_impl(ptr, sub_);
  }
  
+ /*
+  * pg_atomic_fetch_mask_add_u32 - atomically check the masked bits of the
+  * variable and, if they are all clear, add 'add_' to it.
+  *
+  * Returns the value of *ptr before the atomic operation.
+  *
+  * Full barrier semantics.
+  */
+ extern uint32
+ pg_atomic_fetch_mask_add_u32(volatile pg_atomic_uint32 *ptr,
+ 							 uint32 mask_, uint32 add_);
+ 
+ 
  /* ----
   * The 64 bit operations have the same semantics as their 32bit counterparts
   * if they are available. Check the corresponding 32bit function for
