/* { dg-do compile } */ /* { dg-options "-O2 -m32 -mtune=generic" } */
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;

extern int get_src_stride (void);
extern int get_dst_stride (void);

/* Copy HEIGHT rows of WIDTH 32-bit words from pSrc to pDst, OR-ing
   0xFF000000 into every word written.  Successive source rows start
   get_src_stride () words apart and successive destination rows start
   get_dst_stride () words apart.

   The shape of the inner loop (a 16-bit counter decremented in the loop
   condition and post-incremented `register' pointers) is exactly what
   the PR quoted below is about, so it must not be rewritten.  */

void
foo (uint32_t *pSrc, uint32_t *pDst, uint16_t width, uint16_t height)
{
  uint32_t *dstLine;
  register uint32_t *dst;
  uint32_t *srcLine;
  register uint32_t *src;
  int dstStride, srcStride;
  uint16_t w;

  srcStride = get_src_stride ();
  dstStride = get_dst_stride ();

  dstLine = pDst;
  srcLine = pSrc;

  while (height--)
    {
      /* Start of the current row; the line pointers advance to the
	 next row before the row is copied.  */
      dst = dstLine;
      dstLine += dstStride;
      src = srcLine;
      srcLine += srcStride;
      w = width;

      while (w--)
	*dst++ = *src++ | 0xFF000000;
    }
}

/* Text of the original bug report, kept for reference.

   The function above generates extremely poor code for the inner loop
   in 4.1 and 4.2:

   .L6:
	movl	-16(%ebp), %eax		# src,
	subw	$1, -34(%ebp)		#, w
	addl	$4, -16(%ebp)		#, src
	movl	(%eax), %ecx		#,
	movl	-20(%ebp), %eax		# dst,
	orl	$-16777216, %ecx	#,
	movl	%ecx, (%eax)		#,
	addl	$4, %eax		#,
	cmpw	$-1, -34(%ebp)		#, w
	movl	%eax, -20(%ebp)		#, dst
	je	.L4			#,
	jmp	.L6			#

   I believe this has been introduced by the
   http://gcc.gnu.org/ml/gcc-patches/2005-07/msg02021.html
   patch and fixed by
   http://gcc.gnu.org/ml/gcc-patches/2007-01/msg02095.html
   on the trunk.  The generated loop isn't perfect on the trunk:

   .L4:
	movl	(%ebx), %eax		#* src, tmp82
	addl	$4, %ebx		#, src
	subw	$1, -14(%ebp)		#, w
	orl	$-16777216, %eax	#, tmp82
	movl	%eax, (%edi)		# tmp82,* dst
	addl	$4, %edi		#, dst
	cmpw	$-1, -14(%ebp)		#, w
	je	.L3			#,
	jmp	.L4			#

   but still far better than what 4.1 and 4.2 generate.

   Slightly modified:

     typedef unsigned short int uint16_t;
     typedef unsigned int uint32_t;

     extern int get_src_stride(void);
     extern int get_dst_stride(void);

     void foo (uint32_t *pSrc, uint32_t *pDst, uint16_t width, uint16_t height)
     {
       uint32_t *dstLine;
       register uint32_t *dst;
       uint32_t *srcLine;
       register uint32_t *src;
       int dstStride, srcStride;
       uint32_t w;

       srcStride = get_src_stride ();
       dstStride = get_dst_stride ();

       dstLine = pDst;
       srcLine = pSrc;

       while (height--)
         {
           dst = dstLine;
           dstLine += dstStride;
           src = srcLine;
           srcLine += srcStride;

           for (w = 0; w < width; w++)
             dst[w] = src[w] | 0xFF000000;
         }
     }

   generates more compact code:

   .L4:
	movl	(%edx,%ecx,4), %eax	#* srcLine, tmp79
	orl	$-16777216, %eax	#, tmp79
	movl	%eax, (%ebx,%ecx,4)	# tmp79,* dstLine
	addl	$1, %ecx		#, w
	cmpl	%esi, %ecx		# width, w
	jae	.L3			#,
	jmp	.L4			#

   --
   Summary: [4.1/4.2 Regression] Poor code for inner loop on i386
   Product: gcc
   Version: 4.1.2
   Status: UNCONFIRMED
   Severity: normal
   Priority: P3
   Component: target
   AssignedTo: unassigned at gcc dot gnu dot org
   ReportedBy: jakub at gcc dot gnu dot org
   GCC target triplet: i386-linux

   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32414  */