Good morning everyone, I have an interesting situation. In this bit of code below, extracted from a simple testcase, I have a singly-linked list:
------------------------------<snip!>------------------------------ template <class list_node> class List { public: List() : head(__null) { } void insert (list_node *node) { List_insert (head, node); } list_node *head; }; class pthread_mutex: public verifyable_object { public: [ ... data members ... ] pthread_mutex (pthread_mutexattr * = __null); ~pthread_mutex (); class pthread_mutex * next; private: static List<pthread_mutex> mutexes; }; List<pthread_mutex> pthread_mutex::mutexes; pthread_mutex::pthread_mutex (pthread_mutexattr *attr) : [ ... member initialisers ... ] { [ ... code ... ] mutexes.insert (this); } ------------------------------<snip!>------------------------------ I am getting unexpected results in the inlined List.insert operation in the pthread_mutex constructor. The critical part of the code uses an inlined interlocked compare-and-exchange asm, that looks like this: ------------------------------<snip!>------------------------------ extern __inline__ long ilockcmpexch (volatile long *t, long v, long c) { return ({ __typeof (*t) ret; __asm __volatile ("lock cmpxchgl %2, %1" : "=a" (ret), "=m" (*t) : "r" (v), "m" (*t), "0" (c)); ret; }); } template <class list_node> inline void List_insert (list_node *&head, list_node *node) { if (!node) return; do node->next = head; while ((PVOID)ilockcmpexch((LONG volatile *)(&head),(LONG)(node),(LONG)(node->next)) != node->next); } ------------------------------<snip!>------------------------------ To my surprise, GCCs 4.3.2 and 4.3.3 at -O2 both sink the store to node->next after the call to ilockcmpexch: ------------------------------<snip!>------------------------------ __ZN13pthread_mutexC1EP17pthread_mutexattr: [ ... code ... ] L15: movl __ZN13pthread_mutex7mutexesE, %edx # mutexes.head, D.1991 movl %edx, %eax # D.1991, tmp69 /APP # 35 "mxfull.cpp" 1 lock cmpxchgl %esi, __ZN13pthread_mutex7mutexesE # this, # 0 "" 2 /NO_APP movl %eax, -12(%ebp) # tmp69, ret movl -12(%ebp), %eax # ret, D.1988 cmpl %eax, %edx # D.1988, D.1991 jne L15 #, movl %edx, 36(%esi) # D.1991, <variable>.next ------------------------------<snip!>------------------------------ This is obviously bad news for the consistency of the list; the value of 'head' is cached in %edx and not written to node->next until after the ilockcmpexch inline, meaning an incompletely-constructed node gets linked on the front of the chain for a window of several instructions. By contrast, a recent build from head does what I want: it writes to node->next in front of the ilockcmpexch, and only tests its value afterward: ------------------------------<snip!>------------------------------ __ZN13pthread_mutexC2EP17pthread_mutexattr: [ ... code ... ] L9: movl __ZN13pthread_mutex7mutexesE, %eax # mutexes.head, D.2119 movl %eax, 36(%ebx) # D.2119, <variable>.next /APP # 35 "mxfull.cpp" 1 lock cmpxchgl %ebx, __ZN13pthread_mutex7mutexesE # this, # 0 "" 2 /NO_APP movl %eax, -12(%ebp) # tmp79, ret movl -12(%ebp), %eax # ret, D.2120 cmpl %eax, 36(%ebx) # D.2120, <variable>.next jne L9 #, ------------------------------<snip!>------------------------------ Adding a "memory" clobber to the inline asm works around the problem, causing 4.3 series to generate the same assembly as head, but I think it's a sledgehammer approach. Am I asking too much of GCC to not sink the store, or is 4.3 doing something wrong? I /think/ that the fact that there's a volatile store in ilockcmpexch means the earlier store shouldn't be moved past it, and that GCC is perhaps missing that the asm's output operand effectively represents a volatile write through *t, but I could be misunderstanding the rules about volatile. Anyone got their language lawyer's hat on at the moment? cheers, DaveK
// g++ -c mxfull.cpp -o mxfull.o --save-temps -O2 -fverbose-asm typedef long LONG; typedef void *HANDLE; typedef void *PVOID; typedef char *LPCSTR; typedef class pthread_mutex *pthread_mutex_t; typedef class pthread_mutexattr *pthread_mutexattr_t; typedef class pthread *pthread_t; struct SECURITY_ATTRIBUTES; typedef struct SECURITY_ATTRIBUTES *LPSECURITY_ATTRIBUTES; extern struct SECURITY_ATTRIBUTES sec_none_nih; HANDLE __attribute__((__stdcall__)) CreateSemaphoreA(LPSECURITY_ATTRIBUTES,LONG,LONG,LPCSTR); class verifyable_object { public: long magic; verifyable_object (long); virtual ~verifyable_object (); }; extern __inline__ long ilockcmpexch (volatile long *t, long v, long c) { return ({ __typeof (*t) ret; __asm __volatile ("lock cmpxchgl %2, %1" : "=a" (ret), "=m" (*t) : "r" (v), "m" (*t), "0" (c)); ret; }); } class pthread_mutexattr: public verifyable_object { public: int pshared; int mutextype; pthread_mutexattr (); ~pthread_mutexattr (); }; template <class list_node> inline void List_insert (list_node *&head, list_node *node) { if (!node) return; do node->next = head; while ((PVOID)ilockcmpexch((LONG volatile *)(&head),(LONG)(node),(LONG)(node->next)) != node->next); } template <class list_node> class List { public: List() : head(__null) { } ~List() { } void insert (list_node *node) { List_insert (head, node); } list_node *head; }; class pthread_mutex: public verifyable_object { public: unsigned long lock_counter; HANDLE win32_obj_id; unsigned int recursion_counter; LONG condwaits; pthread_t owner; int type; int pshared; pthread_mutex (pthread_mutexattr * = __null); ~pthread_mutex (); class pthread_mutex * next; private: static List<pthread_mutex> mutexes; }; List<pthread_mutex> pthread_mutex::mutexes; pthread_mutex::pthread_mutex (pthread_mutexattr *attr) : verifyable_object (0xdf0df045 +1), lock_counter (0), win32_obj_id (__null), recursion_counter (0), condwaits (0), owner (__null), type (1), pshared (0) { win32_obj_id = ::CreateSemaphoreA (&sec_none_nih, 0, 2147483647L, __null); if (!win32_obj_id) { magic = 0; return; } if (attr) { if (attr->pshared == 1) { magic = 0; return; } type = attr->mutextype; } mutexes.insert (this); }