https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63277
Bug ID: 63277 Summary: ARM - NEON excessive use of vmov for vtbl2 / uint8x8x2 for shuffling data unnecessarily around Product: gcc Version: 5.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: janne-gcc at jannau dot net Created attachment 33500 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=33500&action=edit small example source code armv7a-hardfloat-linux-gnueabi-gcc-5.0.0 -v Using built-in specs. COLLECT_GCC=/opt/gcc/bin/armv7a-hardfloat-linux-gnueabi-gcc-5.0.0 COLLECT_LTO_WRAPPER=/opt/gcc/libexec/gcc/armv7a-hardfloat-linux-gnueabi/5.0.0/lto-wrapper Target: armv7a-hardfloat-linux-gnueabi Configured with: /home/janne/src/gcc-trunk/configure --host=x86_64-pc-linux-gnu --target=armv7a-hardfloat-linux-gnueabi --build=x86_64-pc-linux-gnu --prefix=/opt/gcc/ --enable-languages=c,c++,fortran --enable-obsolete --enable-secureplt --disable-werror --with-system-zlib --enable-nls --without-included-gettext --enable-checking=release --enable-libstdcxx-time --enable-poison-system-directories --with-sysroot=/usr/armv7a-hardfloat-linux-gnueabi --disable-bootstrap --enable-__cxa_atexit --enable-clocale=gnu --disable-multilib --disable-altivec --disable-fixed-point --with-float=hard --with-arch=armv7-a --with-float=hard --with-fpu=neon --disable-libgcj --enable-libgomp --disable-libmudflap --disable-libssp --enable-lto --without-cloog Thread model: posix gcc version 5.0.0 20140916 (experimental) (GCC) armv7a-hardfloat-linux-gnueabi-gcc-5.0.0 -march=armv7-a -mfpu=neon -O3 -S arm_neon_excessive_vmov.c -o - .arch armv7-a .eabi_attribute 27, 3 .eabi_attribute 28, 1 .fpu neon .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 .eabi_attribute 24, 1 .eabi_attribute 25, 1 .eabi_attribute 26, 2 .eabi_attribute 30, 2 .eabi_attribute 34, 1 .eabi_attribute 18, 4 .file "arm_neon_excessive_vmov.c" .text .align 2 .global gf_w8_split_multiply_region_neon .type gf_w8_split_multiply_region_neon, %function 
gf_w8_split_multiply_region_neon: @ args = 4, pretend = 0, frame = 0 @ frame_needed = 0, uses_anonymous_args = 0 str lr, [sp, #-4]! mov r3, r3, asl #4 ldr ip, [sp, #4] add lr, r0, #4096 add lr, lr, r3 add r0, r0, r3 vmov.i8 q15, #15 @ v16qi add ip, r2, ip, lsl #4 vld1.8 {d18-d19}, [lr] cmp r1, ip vld1.8 {d16-d17}, [r0] ldrcs pc, [sp], #4 vmov d27, d18 @ v8qi vmov d26, d19 @ v8qi vmov d25, d16 @ v8qi vmov d24, d17 @ v8qi .L3: vld1.8 {d18-d19}, [r1] vmov d20, d25 @ v8qi vmov d21, d24 @ v8qi add r1, r1, #16 vshr.u8 q14, q9, #4 cmp r1, ip vmov d22, d27 @ v8qi vmov d23, d26 @ v8qi vtbl.8 d16, {d20, d21}, d28 vand q9, q9, q15 vtbl.8 d28, {d20, d21}, d29 vtbl.8 d17, {d22, d23}, d18 vmov d29, d28 @ v8qi vmov d28, d16 @ v8qi vtbl.8 d16, {d22, d23}, d19 vswp d16, d17 veor q8, q8, q14 vst1.8 {d16-d17}, [r2] add r2, r2, #16 bcc .L3 ldr pc, [sp], #4 .size gf_w8_split_multiply_region_neon, .-gf_w8_split_multiply_region_neon .ident "GCC: (GNU) 5.0.0 20140916 (experimental)" .section .note.GNU-stack,"",%progbits None of the vmov/vswp instructions are necessary; clang 3.4 generates assembly without them from the same source file.