https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63173

            Bug ID: 63173
           Summary: performance problem with simd intrinsics vld2_dup_* on
                    aarch64-none-elf
           Product: gcc
           Version: 4.9.2
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: chenshanyaoboy at gmail dot com

Hi,
I found a performance problem with some SIMD intrinsics
(vld2_dup_*) on aarch64-none-elf. The vld2_dup_* intrinsics are
currently defined as follows:

#define __LD2R_FUNC(rettype, structtype, ptrtype, \
    regsuffix, funcsuffix, Q) \
  __extension__ static __inline rettype \
  __attribute__ ((__always_inline__))  \
  vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr) \
  { \
    rettype result; \
    __asm__ ("ld2r {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
     "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \
     : "=Q"(result) \
     : "Q"(*(const structtype *)ptr) \
     : "memory", "v16", "v17"); \
    return result; \
  }

The inline assembly loads from memory into registers and then stores
those registers back to memory as the result. Such code performs very
poorly because of the redundant memory accesses and because the
hard-coded registers (v16, v17) restrict register allocation.

cat test1.c

#include <arm_neon.h>
int16x4x2_t foo(int16_t * __restrict pDataA,
                int16_t * __restrict pDataB)
{
    int16x4x2_t DataA, DataB, DataC;

    DataA = vld2_dup_s16(pDataA);
    DataB = vld2_dup_s16(pDataB);

    DataC.val[0] = vqadd_s16( DataA.val[0], DataB.val[0] ); 
    DataC.val[1] = vqadd_s16( DataA.val[1], DataB.val[1] ); 

    return DataC;
}

aarch64-none-elf-gcc -S -O2 test1.c
cat test1.s

foo:
    sub  sp, sp, #16
    //start of user assembly
    ld2r {v16.4h, v17.4h}, [x0]
    st1  {v16.4h, v17.4h}, [sp]
    //end of user assembly
    ldr  d0, [sp]
    ldr  d1, [sp,8]
    //start of user assembly
    ld2r {v16.4h, v17.4h}, [x1]
    st1  {v16.4h, v17.4h}, [sp]
    //end of user assembly
    ldr  d2, [sp]
    sqadd v0.4h, v0.4h, v2.4h
    ldr  d2, [sp,8]
    add sp, sp, 16
    sqadd v1.4h, v1.4h, v2.4h
    ret


Some intrinsics, such as vld2_*, used to be implemented the same way as
vld2_dup_*, but they are now implemented with builtin functions:

__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vld2_s16 (const int16_t * __a)
{
  int16x4x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
  return ret;
}

test2.c is similar to test1.c, except that vld2_s16 is used instead of vld2_dup_s16.
cat test2.c

#include <arm_neon.h>
int16x4x2_t foo (int16_t * __restrict pDataA,
                 int16_t * __restrict pDataB)
{
    int16x4x2_t DataA, DataB, DataC;

    DataA = vld2_s16(pDataA);
    DataB = vld2_s16(pDataB);

    DataC.val[0] = vqadd_s16( DataA.val[0], DataB.val[0] ); 
    DataC.val[1] = vqadd_s16( DataA.val[1], DataB.val[1] ); 

    return DataC;
}
aarch64-none-elf-gcc -S -O2 test2.c
cat test2.s

foo:
    ld2 {v2.4h-v3.4h}, [x0]
    ld2 {v4.4h-v5.4h}, [x1]
    sqadd v1.4h, v5.4h, v3.4h
    sqadd v0.4h, v4.4h, v2.4h
    ret

Could vld2_dup_* also be implemented with builtins, as vld2_* is?

Reply via email to