vec_combine is really one instruction on aarch64, provided that the lowpart element is in the same register as the destination vector. This patch adds patterns for that.
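
As a concrete illustration (not part of the patch itself; it simply mirrors the
s32q_1 test added below, and the function name is invented for the example),
combining two values that are already in FPRs now needs only a single INS,
because the low half is tied to the destination register:

#include <arm_neon.h>

int32x4_t combine_example(int32x2_t a0, int32x2_t a1) {
  /* On little-endian lp64, a0 arrives in the low half of v0, which is also
     the result register, so the expected code with this patch is:
         ins     v0.d[1], v1.d[0]
         ret  */
  return vcombine_s32 (a0, a1);
}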

The patch fixes a regression from GCC 8.  Before the patch:

int64x2_t s64q_1(int64_t a0, int64_t a1) {
  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    return (int64x2_t) { a1, a0 };
  else
    return (int64x2_t) { a0, a1 };
}

generated:

        fmov    d0, x0
        ins     v0.d[1], x1
        ins     v0.d[1], x1
        ret

whereas GCC 8 generated the more respectable:

        dup     v0.2d, x0
        ins     v0.d[1], x1
        ret

gcc/
	* config/aarch64/predicates.md (aarch64_reg_or_mem_pair_operand): New
	predicate.
	* config/aarch64/aarch64-simd.md (*aarch64_combine_internal<mode>)
	(*aarch64_combine_internal_be<mode>): New patterns.

gcc/testsuite/
	* gcc.target/aarch64/vec-init-9.c: New test.
	* gcc.target/aarch64/vec-init-10.c: Likewise.
	* gcc.target/aarch64/vec-init-11.c: Likewise.
---
 gcc/config/aarch64/aarch64-simd.md            |  62 ++++
 gcc/config/aarch64/predicates.md              |   4 +
 .../gcc.target/aarch64/vec-init-10.c          |  15 +
 .../gcc.target/aarch64/vec-init-11.c          |  12 +
 gcc/testsuite/gcc.target/aarch64/vec-init-9.c | 267 ++++++++++++++++++
 5 files changed, 360 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-10.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-11.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vec-init-9.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d6cd4c70fe7..ead80396e70 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4326,6 +4326,25 @@ (define_insn "load_pair_lanes<mode>"
   [(set_attr "type" "neon_load1_1reg_q")]
 )
 
+;; This STP pattern is a partial duplicate of the general vec_concat patterns
+;; below.  The reason for having both of them is that the alternatives of
+;; the later patterns do not have consistent register preferences: the STP
+;; alternatives have no preference between GPRs and FPRs (and if anything,
+;; the GPR form is more natural for scalar integers) whereas the other
+;; alternatives *require* an FPR for operand 1 and prefer one for operand 2.
+;;
+;; Using "*" to hide the STP alternatives from the RA penalizes cases in
+;; which the destination was always memory.  On the other hand, expressing
+;; the true preferences makes GPRs seem more palatable than they really are
+;; for register destinations.
+;;
+;; Despite that, we do still want the general form to have STP alternatives,
+;; in order to handle cases where a register destination is spilled.
+;;
+;; The best compromise therefore seemed to be to have a dedicated STP
+;; pattern to catch cases in which the destination was always memory.
+;; This dedicated pattern must come first.
+
 (define_insn "store_pair_lanes<mode>"
   [(set (match_operand:<VDBL> 0 "aarch64_mem_pair_lanes_operand" "=Umn, Umn")
 	(vec_concat:<VDBL>
@@ -4338,6 +4357,49 @@ (define_insn "store_pair_lanes<mode>"
   [(set_attr "type" "neon_stp, store_16")]
 )
 
+;; Form a vector whose least significant half comes from operand 1 and whose
+;; most significant half comes from operand 2.  The register alternatives
+;; tie the least significant half to the same register as the destination,
+;; so that only the other half needs to be handled explicitly.  For the
+;; reasons given above, the STP alternatives use ? for constraints that
+;; the register alternatives either don't accept or themselves disparage.
+
+(define_insn "*aarch64_combine_internal<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
+	(vec_concat:<VDBL>
+	  (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")
+	  (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))]
+  "TARGET_SIMD
+   && !BYTES_BIG_ENDIAN
+   && (register_operand (operands[0], <VDBL>mode)
+       || register_operand (operands[2], <MODE>mode))"
+  "@
+   ins\t%0.d[1], %2.d[0]
+   ins\t%0.d[1], %2
+   ld1\t{%0.d}[1], %2
+   stp\t%d1, %d2, %y0
+   stp\t%x1, %x2, %y0"
+  [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+)
+
+(define_insn "*aarch64_combine_internal_be<mode>"
+  [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
+	(vec_concat:<VDBL>
+	  (match_operand:VDC 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r")
+	  (match_operand:VDC 1 "register_operand" "0, 0, 0, ?w, ?r")))]
+  "TARGET_SIMD
+   && BYTES_BIG_ENDIAN
+   && (register_operand (operands[0], <VDBL>mode)
+       || register_operand (operands[2], <MODE>mode))"
+  "@
+   ins\t%0.d[1], %2.d[0]
+   ins\t%0.d[1], %2
+   ld1\t{%0.d}[1], %2
+   stp\t%d2, %d1, %y0
+   stp\t%x2, %x1, %y0"
+  [(set_attr "type" "neon_ins_q, neon_from_gp_q, neon_load1_one_lane_q, neon_stp, store_16")]
+)
+
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 7dc4c155ea8..c308015ac2c 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -254,6 +254,10 @@ (define_predicate "aarch64_mem_pair_lanes_operand"
 						  false,
 						  ADDR_QUERY_LDP_STP_N)")))
 
+(define_predicate "aarch64_reg_or_mem_pair_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "aarch64_mem_pair_lanes_operand")))
+
 (define_predicate "aarch64_prefetch_operand"
   (match_test "aarch64_address_valid_for_prefetch_p (op, false)"))
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-10.c b/gcc/testsuite/gcc.target/aarch64/vec-init-10.c
new file mode 100644
index 00000000000..f5dd83b94b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-10.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+int64x2_t f1(int64_t *x, int c) {
+  return c ? (int64x2_t) { x[0], x[2] } : (int64x2_t) { 0, 0 };
+}
+
+int64x2_t f2(int64_t *x, int i0, int i1, int c) {
+  return c ? (int64x2_t) { x[i0], x[i1] } : (int64x2_t) { 0, 0 };
+}
+
+/* { dg-final { scan-assembler-times {\t(?:ldr\td[0-9]+|ld1\t)} 4 } } */
+/* { dg-final { scan-assembler-not {\tldr\tx} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-11.c b/gcc/testsuite/gcc.target/aarch64/vec-init-11.c
new file mode 100644
index 00000000000..df242702c0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-11.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+void f1(int64x2_t *res, int64_t *x, int c0, int c1) {
+  res[0] = (int64x2_t) { c0 ? x[0] : 0, c1 ? x[2] : 0 };
+}
+
+/* { dg-final { scan-assembler-times {\tldr\tx[0-9]+} 2 } } */
+/* { dg-final { scan-assembler {\tstp\tx[0-9]+, x[0-9]+} } } */
+/* { dg-final { scan-assembler-not {\tldr\td} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
new file mode 100644
index 00000000000..8f68e06a559
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c
@@ -0,0 +1,267 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+void ext();
+
+/*
+** s64q_1:
+**	fmov	d0, x0
+**	ins	v0\.d\[1\], x1
+**	ret
+*/
+int64x2_t s64q_1(int64_t a0, int64_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { a1, a0 };
+  else
+    return (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_2:
+**	fmov	d0, x0
+**	ld1	{v0\.d}\[1\], \[x1\]
+**	ret
+*/
+int64x2_t s64q_2(int64_t a0, int64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { ptr[0], a0 };
+  else
+    return (int64x2_t) { a0, ptr[0] };
+}
+/*
+** s64q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], x1
+**	ret
+*/
+int64x2_t s64q_3(int64_t *ptr, int64_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (int64x2_t) { a1, ptr[0] };
+  else
+    return (int64x2_t) { ptr[0], a1 };
+}
+/*
+** s64q_4:
+**	stp	x1, x2, \[x0\]
+**	ret
+*/
+void s64q_4(int64x2_t *res, int64_t a0, int64_t a1) {
+  res[0] = (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_5:
+**	stp	x1, x2, \[x0, #?8\]
+**	ret
+*/
+void s64q_5(uintptr_t res, int64_t a0, int64_t a1) {
+  *(int64x2_t *)(res + 8) = (int64x2_t) { a0, a1 };
+}
+/*
+** s64q_6:
+**	...
+**	stp	x0, x1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+int64x2_t s64q_6(int64_t a0, int64_t a1) {
+  int64x2_t res = { a0, a1 };
+  ext ();
+  return res;
+}
+
+/*
+** f64q_1:
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float64x2_t f64q_1(float64_t a0, float64_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { a1, a0 };
+  else
+    return (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_2:
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+float64x2_t f64q_2(float64_t a0, float64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { ptr[0], a0 };
+  else
+    return (float64x2_t) { a0, ptr[0] };
+}
+/*
+** f64q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float64x2_t f64q_3(float64_t a0, float64_t a1, float64_t *ptr) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    return (float64x2_t) { a1, ptr[0] };
+  else
+    return (float64x2_t) { ptr[0], a1 };
+}
+/*
+** f64q_4:
+**	stp	d0, d1, \[x0\]
+**	ret
+*/
+void f64q_4(float64x2_t *res, float64_t a0, float64_t a1) {
+  res[0] = (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_5:
+**	stp	d0, d1, \[x0, #?8\]
+**	ret
+*/
+void f64q_5(uintptr_t res, float64_t a0, float64_t a1) {
+  *(float64x2_t *)(res + 8) = (float64x2_t) { a0, a1 };
+}
+/*
+** f64q_6:
+**	...
+**	stp	d0, d1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+float64x2_t f64q_6(float64_t a0, float64_t a1) {
+  float64x2_t res = { a0, a1 };
+  ext ();
+  return res;
+}
+
+/*
+** s32q_1:
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) {
+  return vcombine_s32 (a0, a1);
+}
+/*
+** s32q_2:
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+int32x4_t s32q_2(int32x2_t a0, int32x2_t *ptr) {
+  return vcombine_s32 (a0, ptr[0]);
+}
+/*
+** s32q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+int32x4_t s32q_3(int32x2_t a0, int32x2_t a1, int32x2_t *ptr) {
+  return vcombine_s32 (ptr[0], a1);
+}
+/*
+** s32q_4:
+**	stp	d0, d1, \[x0\]
+**	ret
+*/
+void s32q_4(int32x4_t *res, int32x2_t a0, int32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    res[0] = vcombine_s32 (a1, a0);
+  else
+    res[0] = vcombine_s32 (a0, a1);
+}
+/*
+** s32q_5:
+**	stp	d0, d1, \[x0, #?8\]
+**	ret
+*/
+void s32q_5(uintptr_t res, int32x2_t a0, int32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    *(int32x4_t *)(res + 8) = vcombine_s32 (a1, a0);
+  else
+    *(int32x4_t *)(res + 8) = vcombine_s32 (a0, a1);
+}
+/*
+** s32q_6:
+**	...
+**	stp	d0, d1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+int32x4_t s32q_6(int32x2_t a0, int32x2_t a1) {
+  int32x4_t res = (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		   ? vcombine_s32 (a1, a0)
+		   : vcombine_s32 (a0, a1));
+  ext ();
+  return res;
+}
+
+/*
+** f32q_1:
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) {
+  return vcombine_f32 (a0, a1);
+}
+/*
+** f32q_2:
+**	ld1	{v0\.d}\[1\], \[x0\]
+**	ret
+*/
+float32x4_t f32q_2(float32x2_t a0, float32x2_t *ptr) {
+  return vcombine_f32 (a0, ptr[0]);
+}
+/*
+** f32q_3:
+**	ldr	d0, \[x0\]
+**	ins	v0\.d\[1\], v1\.d\[0\]
+**	ret
+*/
+float32x4_t f32q_3(float32x2_t a0, float32x2_t a1, float32x2_t *ptr) {
+  return vcombine_f32 (ptr[0], a1);
+}
+/*
+** f32q_4:
+**	stp	d0, d1, \[x0\]
+**	ret
+*/
+void f32q_4(float32x4_t *res, float32x2_t a0, float32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    res[0] = vcombine_f32 (a1, a0);
+  else
+    res[0] = vcombine_f32 (a0, a1);
+}
+/*
+** f32q_5:
+**	stp	d0, d1, \[x0, #?8\]
+**	ret
+*/
+void f32q_5(uintptr_t res, float32x2_t a0, float32x2_t a1) {
+  if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    *(float32x4_t *)(res + 8) = vcombine_f32 (a1, a0);
+  else
+    *(float32x4_t *)(res + 8) = vcombine_f32 (a0, a1);
+}
+/*
+** f32q_6:
+**	...
+**	stp	d0, d1, .*
+**	...
+**	ldr	q0, .*
+**	...
+**	ret
+*/
+float32x4_t f32q_6(float32x2_t a0, float32x2_t a1) {
+  float32x4_t res = (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		     ? vcombine_f32 (a1, a0)
+		     : vcombine_f32 (a0, a1));
+  ext ();
+  return res;
+}
-- 
2.25.1