After previous patches, we have a (mostly new) group of vec_concat patterns as well as vestiges of the old move_lo/hi_quad patterns. (A previous patch removed the move_lo_quad insns, but we still have the move_hi_quad insns and both sets of expanders.)
This patch is the first of two to remove the old move_lo/hi_quad
stuff.  It isn't technically a regression fix, but it seemed better
to make the changes now rather than leave things in a half-finished
and inconsistent state.

This patch defines an aarch64_vec_concat expander that coerces the
element operands into a valid form, including the ones added by the
previous patch.  This in turn lets us get rid of one move_lo/hi_quad
pair.

As a side-effect, it also means that vcombines of 2 vectors make
better use of the available forms, like vec_inits of 2 scalars
already do.
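
As a rough illustration (this snippet is not part of the patch; it
simply mirrors the s32_2/f32_2 cases in the new vec-init-12.c test
below), a vcombine whose high half comes from memory is one of the
cases that can now use the vec_concat patterns directly, e.g. an
address add plus a single "ld1 {v0.d}[1]" on little-endian instead of
going through the old move_lo/hi_quad expansion:

    #include <arm_neon.h>

    /* Illustrative only: low half already in a register, high half
       loaded from memory.  With this patch the expander is expected to
       emit an add plus an ld1 into lane 1 (see s32_2 below).  */
    int32x4_t
    combine_reg_mem (int32x2_t lo, int32x2_t *ptr)
    {
      return vcombine_s32 (lo, ptr[1]);
    }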

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_split_simd_combine):
	Delete.
	* config/aarch64/aarch64-simd.md (@aarch64_combinez<mode>): Rename
	to...
	(*aarch64_combinez<mode>): ...this.
	(@aarch64_combinez_be<mode>): Rename to...
	(*aarch64_combinez_be<mode>): ...this.
	(@aarch64_vec_concat<mode>): New expander.
	(aarch64_combine<mode>): Use it.
	(@aarch64_simd_combine<mode>): Delete.
	* config/aarch64/aarch64.cc (aarch64_split_simd_combine): Delete.
	(aarch64_expand_vector_init): Use aarch64_vec_concat.

gcc/testsuite/
	* gcc.target/aarch64/vec-init-12.c: New test.
---
 gcc/config/aarch64/aarch64-protos.h          |  2 -
 gcc/config/aarch64/aarch64-simd.md           | 76 ++++++++++++-------
 gcc/config/aarch64/aarch64.cc                | 55 ++------------
 .../gcc.target/aarch64/vec-init-12.c         | 65 ++++++++++++++++
 4 files changed, 122 insertions(+), 76 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-12.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b75ed35635b..392efa0b74d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -925,8 +925,6 @@ bool aarch64_split_128bit_move_p (rtx, rtx);
 
 bool aarch64_mov128_immediate (rtx);
 
-void aarch64_split_simd_combine (rtx, rtx, rtx);
-
 void aarch64_split_simd_move (rtx, rtx);
 
 /* Check for a legitimate floating point constant for FMOV.  */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ead80396e70..7acde0dd099 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4403,7 +4403,7 @@ (define_insn "*aarch64_combine_internal_be<mode>"
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
-(define_insn "@aarch64_combinez<mode>"
+(define_insn "*aarch64_combinez<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
 	(vec_concat:<VDBL>
 	  (match_operand:VDC 1 "nonimmediate_operand" "w,?r,m")
@@ -4417,7 +4417,7 @@ (define_insn "@aarch64_combinez<mode>"
    (set_attr "arch" "simd,fp,simd")]
 )
 
-(define_insn "@aarch64_combinez_be<mode>"
+(define_insn "*aarch64_combinez_be<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
 	(vec_concat:<VDBL>
 	  (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")
@@ -4431,38 +4431,62 @@ (define_insn "@aarch64_combinez_be<mode>"
    (set_attr "arch" "simd,fp,simd")]
 )
 
-(define_expand "aarch64_combine<mode>"
-  [(match_operand:<VDBL> 0 "register_operand")
-   (match_operand:VDC 1 "register_operand")
-   (match_operand:VDC 2 "aarch64_simd_reg_or_zero")]
+;; Form a vector whose first half (in array order) comes from operand 1
+;; and whose second half (in array order) comes from operand 2.
+;; This operand order follows the RTL vec_concat operation.
+(define_expand "@aarch64_vec_concat<mode>"
+  [(set (match_operand:<VDBL> 0 "register_operand")
+	(vec_concat:<VDBL>
+	  (match_operand:VDC 1 "general_operand")
+	  (match_operand:VDC 2 "general_operand")))]
   "TARGET_SIMD"
 {
-  if (operands[2] == CONST0_RTX (<MODE>mode))
+  int lo = BYTES_BIG_ENDIAN ? 2 : 1;
+  int hi = BYTES_BIG_ENDIAN ? 1 : 2;
+
+  if (MEM_P (operands[1])
+      && MEM_P (operands[2])
+      && aarch64_mergeable_load_pair_p (<VDBL>mode, operands[1], operands[2]))
+    /* Use load_pair_lanes<mode>.  */
+    ;
+  else if (operands[hi] == CONST0_RTX (<MODE>mode))
     {
-      if (BYTES_BIG_ENDIAN)
-	emit_insn (gen_aarch64_combinez_be<mode> (operands[0], operands[1],
-						  operands[2]));
-      else
-	emit_insn (gen_aarch64_combinez<mode> (operands[0], operands[1],
-					       operands[2]));
+      /* Use *aarch64_combinez<mode>.  */
+      if (!nonimmediate_operand (operands[lo], <MODE>mode))
+	operands[lo] = force_reg (<MODE>mode, operands[lo]);
     }
   else
-    aarch64_split_simd_combine (operands[0], operands[1], operands[2]);
-  DONE;
-}
-)
+    {
+      /* Use *aarch64_combine_general<mode>.  */
+      operands[lo] = force_reg (<MODE>mode, operands[lo]);
+      if (!aarch64_simd_nonimmediate_operand (operands[hi], <MODE>mode))
+	{
+	  if (MEM_P (operands[hi]))
+	    {
+	      rtx addr = force_reg (Pmode, XEXP (operands[hi], 0));
+	      operands[hi] = replace_equiv_address (operands[hi], addr);
+	    }
+	  else
+	    operands[hi] = force_reg (<MODE>mode, operands[hi]);
+	}
+    }
+})
 
-(define_expand "@aarch64_simd_combine<mode>"
+;; Form a vector whose least significant half comes from operand 1 and whose
+;; most significant half comes from operand 2.  This operand order follows
+;; arm_neon.h vcombine* intrinsics.
+(define_expand "aarch64_combine<mode>"
   [(match_operand:<VDBL> 0 "register_operand")
-   (match_operand:VDC 1 "register_operand")
-   (match_operand:VDC 2 "register_operand")]
+   (match_operand:VDC 1 "general_operand")
+   (match_operand:VDC 2 "general_operand")]
   "TARGET_SIMD"
-  {
-    emit_insn (gen_move_lo_quad_<Vdbl> (operands[0], operands[1]));
-    emit_insn (gen_move_hi_quad_<Vdbl> (operands[0], operands[2]));
-    DONE;
-  }
-[(set_attr "type" "multiple")]
+{
+  if (BYTES_BIG_ENDIAN)
+    std::swap (operands[1], operands[2]);
+  emit_insn (gen_aarch64_vec_concat<mode> (operands[0], operands[1],
+					   operands[2]));
+  DONE;
+}
 )
 
 ;; <su><addsub>l<q>.
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c47543aebf3..af42d1bedfe 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4239,23 +4239,6 @@ aarch64_split_128bit_move_p (rtx dst, rtx src)
   return true;
 }
 
-/* Split a complex SIMD combine.  */
-
-void
-aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
-{
-  machine_mode src_mode = GET_MODE (src1);
-  machine_mode dst_mode = GET_MODE (dst);
-
-  gcc_assert (VECTOR_MODE_P (dst_mode));
-  gcc_assert (register_operand (dst, dst_mode)
-	      && register_operand (src1, src_mode)
-	      && register_operand (src2, src_mode));
-
-  emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
-  return;
-}
-
 /* Split a complex SIMD move.  */
 
 void
@@ -20941,37 +20924,13 @@ aarch64_expand_vector_init (rtx target, rtx vals)
      of mode N in VALS and we must put their concatentation into TARGET.  */
   if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
     {
-      gcc_assert (known_eq (GET_MODE_SIZE (mode),
-			    2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
-      rtx lo = XVECEXP (vals, 0, 0);
-      rtx hi = XVECEXP (vals, 0, 1);
-      machine_mode narrow_mode = GET_MODE (lo);
-      gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
-      gcc_assert (narrow_mode == GET_MODE (hi));
-
-      /* When we want to concatenate a half-width vector with zeroes we can
-	 use the aarch64_combinez[_be] patterns.  Just make sure that the
-	 zeroes are in the right half.  */
-      if (BYTES_BIG_ENDIAN
-	  && aarch64_simd_imm_zero (lo, narrow_mode)
-	  && general_operand (hi, narrow_mode))
-	emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
-      else if (!BYTES_BIG_ENDIAN
-	       && aarch64_simd_imm_zero (hi, narrow_mode)
-	       && general_operand (lo, narrow_mode))
-	emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
-      else
-	{
-	  /* Else create the two half-width registers and combine them.  */
-	  if (!REG_P (lo))
-	    lo = force_reg (GET_MODE (lo), lo);
-	  if (!REG_P (hi))
-	    hi = force_reg (GET_MODE (hi), hi);
-
-	  if (BYTES_BIG_ENDIAN)
-	    std::swap (lo, hi);
-	  emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
-	}
+      machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
+      gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
+		  && known_eq (GET_MODE_SIZE (mode),
+			       2 * GET_MODE_SIZE (narrow_mode)));
+      emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
+					 XVECEXP (vals, 0, 0),
+					 XVECEXP (vals, 0, 1)));
       return;
     }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-12.c b/gcc/testsuite/gcc.target/aarch64/vec-init-12.c
new file mode 100644
index 00000000000..c287478e2d8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-12.c
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** s32_1:
+**	ldr	q0, \[x0\]
+**	ret
+*/
+int32x4_t s32_1(int32x2_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return vcombine_s32 (ptr[1], ptr[0]);
+  else
+    return vcombine_s32 (ptr[0], ptr[1]);
+}
+/*
+** s32_2:
+**	add	x([0-9])+, x0, #?8
+**	ld1	{v0\.d}\[1\], \[x\1\]
+**	ret
+*/
+int32x4_t s32_2(int32x2_t a0, int32x2_t *ptr) {
+  return vcombine_s32 (a0, ptr[1]);
+}
+/*
+** s32_3:
+**	ldr	d0, \[x0\], #?16
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+int32x4_t s32_3(int32x2_t *ptr) {
+  return vcombine_s32 (ptr[0], ptr[2]);
+}
+
+/*
+** f32_1:
+**	ldr	q0, \[x0\]
+**	ret
+*/
+float32x4_t f32_1(float32x2_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return vcombine_f32 (ptr[1], ptr[0]);
+  else
+    return vcombine_f32 (ptr[0], ptr[1]);
+}
+/*
+** f32_2:
+**	add	x([0-9])+, x0, #?8
+**	ld1	{v0\.d}\[1\], \[x\1\]
+**	ret
+*/
+float32x4_t f32_2(float32x2_t a0, float32x2_t *ptr) {
+  return vcombine_f32 (a0, ptr[1]);
+}
+/*
+** f32_3:
+**	ldr	d0, \[x0\], #?16
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+float32x4_t f32_3(float32x2_t *ptr) {
+  return vcombine_f32 (ptr[0], ptr[2]);
+}
-- 
2.25.1