Hi,

In AArch64, SIMD instructions which only touch the bottom 64 bits of a vector register write zeroes to the upper 64 bits. In other words, we have a cheap way to implement a "zero extend" of a SIMD operation, and can generate efficient code for:
  [(set (match_operand 0)
        (vec_concat:128-bit mode
          (other vector operations in a 64-bit mode)
          (match_operand 2 [zeroes])))]

and for the big-endian equivalent of this. This small patch catches two important cases of this, namely loading a 64-bit vector and moving a 64-bit vector from general-purpose registers to vector registers.

Bootstrapped on aarch64-none-linux-gnu with no issues, and aarch64.exp run for aarch64_be-none-elf.

Ok for trunk?

Thanks,
James

---
gcc/

2015-10-01  James Greenhalgh  <james.greenha...@arm.com>

	* config/aarch64/aarch64-simd.md (*aarch64_combinez<mode>): Add
	alternatives for reads from memory and moves from general-purpose
	registers.
	(*aarch64_combinez_be<mode>): Likewise.

gcc/testsuite/

2015-10-01  James Greenhalgh  <james.greenha...@arm.com>

	* gcc.target/aarch64/vect_combine_zeroes_1.c: New.
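For illustration, this is the sort of code I would expect the two new alternatives to give for the functions in the new testcase below (a sketch only; exact register allocation and scheduling may differ):

	foo:				// vcombine_s32 (*x, zeroes)
		ldr	d0, [x0]	// 64-bit load zeroes bits [127:64] of q0
		ret

	bar:				// vcombine_s32 (vcreate_s32 (x), zeroes)
		fmov	d0, x0		// GP-to-SIMD move zeroes bits [127:64] of q0
		ret

The architectural zeroing of the upper half means the vec_concat with zeroes comes for free in both cases.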
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 541faf9..6a2ab61 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2530,23 +2530,33 @@
 ;; dest vector.
 
 (define_insn "*aarch64_combinez<mode>"
-  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
 	(vec_concat:<VDBL>
-	  (match_operand:VD_BHSI 1 "register_operand" "w")
-	  (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz")))]
+	  (match_operand:VD_BHSI 1 "general_operand" "w,r,m")
+	  (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
-  "mov\\t%0.8b, %1.8b"
-  [(set_attr "type" "neon_move<q>")]
+  "@
+   mov\\t%0.8b, %1.8b
+   fmov\t%d0, %1
+   ldr\\t%d0, %1"
+  [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
+   (set_attr "simd" "yes,*,yes")
+   (set_attr "fp" "*,yes,*")]
 )
 
 (define_insn "*aarch64_combinez_be<mode>"
-  [(set (match_operand:<VDBL> 0 "register_operand" "=&w")
+  [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
 	(vec_concat:<VDBL>
-	  (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz")
-	  (match_operand:VD_BHSI 1 "register_operand" "w")))]
+	  (match_operand:VD_BHSI 2 "aarch64_simd_imm_zero" "Dz,Dz,Dz")
+	  (match_operand:VD_BHSI 1 "general_operand" "w,r,m")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
-  "mov\\t%0.8b, %1.8b"
-  [(set_attr "type" "neon_move<q>")]
+  "@
+   mov\\t%0.8b, %1.8b
+   fmov\t%d0, %1
+   ldr\\t%d0, %1"
+  [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
+   (set_attr "simd" "yes,*,yes")
+   (set_attr "fp" "*,yes,*")]
 )
 
 (define_expand "aarch64_combine<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_combine_zeroes_1.c b/gcc/testsuite/gcc.target/aarch64/vect_combine_zeroes_1.c
new file mode 100644
index 0000000..6257fa9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect_combine_zeroes_1.c
@@ -0,0 +1,24 @@
+/* { dg-options "-O2 --save-temps" } */
+
+#include "arm_neon.h"
+
+int32x4_t
+foo (int32x2_t *x)
+{
+  int32x2_t i = *x;
+  int32x2_t zeroes = vcreate_s32 (0l);
+  int32x4_t ret = vcombine_s32 (i, zeroes);
+  return ret;
+}
+
+int32x4_t
+bar (int64_t x)
+{
+  int32x2_t i = vcreate_s32 (x);
+  int32x2_t zeroes = vcreate_s32 (0l);
+  int32x4_t ret = vcombine_s32 (i, zeroes);
+  return ret;
+}
+
+/* { dg-final { scan-assembler-not "mov\tv\[0-9\]+.8b, v\[0-9\]+.8b" } } */
+
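For reference, before this patch foo would have needed an extra move to match *aarch64_combinez<mode>, along the lines of (again just a sketch; register numbers depend on allocation):

	foo:
		ldr	d1, [x0]
		mov	v0.8b, v1.8b	// copies the low 64 bits, zeroes the rest
		ret

This redundant mov is exactly what the scan-assembler-not directive in the test guards against.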