http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51534

             Bug #: 51534
           Summary: Bad code gen for vcgtq_u32 NEON intrinsic
    Classification: Unclassified
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: rmansfi...@qnx.com
              Host: i686-unknown-linux-gnu
            Target: arm-unknown-linux-gnueabi
             Build: i686-unknown-linux-gnu


$ ./xgcc -v
Using built-in specs.
COLLECT_GCC=./xgcc
Target: arm-unknown-linux-gnueabi
Configured with: ../configure --target=arm-unknown-linux-gnueabi
--prefix=/home/ryan/x-tools/arm-unknown-linux-gnueabi
--with-sysroot=/home/ryan/x-tools/arm-unknown-linux-gnueabi/arm-unknown-linux-gnueabi//sys-root
--disable-multilib
--with-local-prefix=/home/ryan/x-tools/arm-unknown-linux-gnueabi/arm-unknown-linux-gnueabi/sys-root
--disable-nls --enable-threads=posix --enable-symvers=gnu --enable-c99
--enable-long-long --enable-target-optspace
target_alias=arm-unknown-linux-gnueabi --enable-languages=c++ --disable-shared
--disable-libmudflap --disable-libssp
Thread model: posix
gcc version 4.7.0 20111213 (experimental) [trunk revision 182291] (GCC) 

$ cat ~/foo.c
#include <arm_neon.h>

/*
 * Reproducer for the vcgtq_u32 code generation problem described in this
 * report.  Kept exactly as submitted so that it still triggers the issue.
 *
 * src, dst: pointers from which four 32-bit unsigned lanes are loaded per
 *           iteration; the result is stored back through dst.
 * width:    loop bound; the body runs while width >= 4.
 *
 * NOTE(review): nothing in the loop body modifies width, src or dst, so as
 * written the loop never terminates once entered.  Presumably harmless here
 * because the file is only compiled (-c) to inspect the generated assembly,
 * never run -- confirm before reusing this as a runtime test.
 */
void foo (unsigned * src, unsigned *dst, int width)
{
  /* Per-lane shift count of zero. */
  const int32x4_t vec_alpha_shift = vdupq_n_s32 (0);
  const uint32x4_t vec_one = vdupq_n_u32 (1u);
  /* All-zero comparison operand for vcgtq_u32 below; per the report,
     changing this constant to 1u makes the compiler load it into a register
     and emit the register form of vcgt instead of the "#0" immediate form. */
  const uint32x4_t vec_zero = vdupq_n_u32 (0u);

while (width >= 4)
    {
      uint32x4_t s0 = vld1q_u32 (src);
      uint32x4_t d0 = vld1q_u32 (dst);
      uint32x4_t vec_alpha = vshlq_u32 (s0, vec_alpha_shift);
      /* Unsigned "greater than zero" compare, masked with vec_one and added
         back to vec_alpha.  This vcgtq_u32 call is the intrinsic the report
         is about. */
      vec_alpha =
    vaddq_u32 (vec_alpha,
           vandq_u32 (vcgtq_u32 (vec_alpha, vec_zero), vec_one));
      s0 = vmulq_u32 (s0, vec_alpha);
      d0 = vaddq_u32 (s0, d0);
      vst1q_u32 (dst, d0);
    }
}

$ ./xgcc -B. -O3 -ftree-vectorize -mfpu=neon -mfloat-abi=softfp ~/foo.c -march=armv7-a -c

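Changing the code from:

const uint32x4_t vec_zero = vdupq_n_u32 (0u);

to

const uint32x4_t vec_zero = vdupq_n_u32 (1u);

results in a proper register load and a register operand for vcgt. Diff of the
generated assembly ("-" is the original 0u version, "+" is the 1u version):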

     vmov.i32    q9, #0  @ v4si
     vld1.32    {d16-d17}, [r8]
+    vmov.i32    q12, #1  @ v4si
     mov    r0, sl
     vld1.32    {d20-d21}, [sl]
     vshl.u32    q9, q8, q9
-    vcgt.u32    q11, q9, #0
+    vcgt.u32    q11, q9, q12
     vand    q11, q11, q4
     vadd.i32    q9, q9, q11
     vmul.i32    q8, q8, q9

This also happens on the 4.6 branch. It compiles OK with the 4.4 branch. I
haven't checked 4.5 yet.

Reply via email to