Hi Richard, The 04/16/2021 12:23, Richard Sandiford wrote: > Tamar Christina <tamar.christ...@arm.com> writes: > > diff --git a/gcc/config/aarch64/aarch64-sve.md > > b/gcc/config/aarch64/aarch64-sve.md > > index > > 7db2938bb84e04d066a7b07574e5cf344a3a8fb6..2cdc6338902216760622a39b14f0076994458c98 > > 100644 > > --- a/gcc/config/aarch64/aarch64-sve.md > > +++ b/gcc/config/aarch64/aarch64-sve.md > > @@ -8657,6 +8657,22 @@ (define_insn "@aarch64_sve_<perm_insn><mode>" > > "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>" > > ) > > > > +;; Special purpose permute used by the predicate generation instructions. > > +;; This version only accepts VNx16BI as input but can output as any > > predicate > > +;; type and will reinterpet the input registers as the type in operand 3. > > I think it would be more accurate to say something like: > > ;; Special purpose permute used by the predicate generation instructions. > ;; Unlike the normal permute patterns, these instructions operate on VNx16BI > ;; regardless of the element size, so that all input and output bits are > ;; well-defined. Operand 3 then indicates the size of the permute. > > > +(define_insn "@aarch64_sve_trn1_conv<mode>" > > + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") > > + (unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa") > > + (match_operand:VNx16BI 2 "register_operand" "Upa") > > + (clobber > > + (match_operand:PRED_ALL 3 "register_operand" "=Upa")) > > I don't think we need a register for operand 3. We could just use the > CONST0_RTX of the mode: > > (match_operand:PRED_ALL 3 "aarch64_simd_imm_zero") >
Ah! good shout! I was wondering if I could avoid the clobber and this works great. Thanks! Bootstrapped and regtested on aarch64-none-linux-gnu and no issues. Ok for trunk and GCC 10? Regards, Tamar > (no need for a constraint). > > > + ] > > Formatting nit: ] is usually on the previous line. > > > + UNSPEC_TRN1_CONV))] > > + "TARGET_SVE" > > + "trn1\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>" > > +) > > + > > + > > Just one blank line here (sorry for the nitpick). > > > ;; > > ========================================================================= > > ;; == Conversions > > ;; > > ========================================================================= > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > > index > > 994fafc2dc857ca5c7f345e49b47cc7e7dcf5900..61337881bfd05dbf6e84ada6810b87fa36dc989d > > 100644 > > --- a/gcc/config/aarch64/aarch64.c > > +++ b/gcc/config/aarch64/aarch64.c > > @@ -5481,12 +5481,13 @@ aarch64_expand_sve_const_pred_trn (rtx target, > > rtx_vector_builder &builder, > > } > > } > > > > - /* Emit the TRN1 itself. */ > > + /* Emit the TRN1 itself. We emit a TRN that will always take a > > + input registers as VNx16BI but re-interpret the results to > > + MODE. */ > > Here too I think the output register mode is as important as the > input register mode, since we rely on all bits of the output being > well-defined. How about something like: > > /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI > operands but permutes them as though they had mode MODE. */ > > Thanks, > Richard > > > machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); > > - target = aarch64_target_reg (target, mode); > > - emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target, > > - gen_lowpart (mode, a), > > - gen_lowpart (mode, b))); > > + target = aarch64_target_reg (target, GET_MODE (a)); > > + rtx type_reg = gen_reg_rtx (mode); > > + emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg)); > > return target; > > } > > > > diff --git a/gcc/config/aarch64/iterators.md > > b/gcc/config/aarch64/iterators.md > > index > > 5f5abd60525ba52fdb466e94a92ff4d011bee5cd..cac33ae812b382cd55611b0da8a6e9eac3a513c4 > > 100644 > > --- a/gcc/config/aarch64/iterators.md > > +++ b/gcc/config/aarch64/iterators.md > > @@ -649,6 +649,7 @@ (define_c_enum "unspec" > > UNSPEC_UZP2Q ; Used in aarch64-sve.md. > > UNSPEC_ZIP1Q ; Used in aarch64-sve.md. > > UNSPEC_ZIP2Q ; Used in aarch64-sve.md. > > + UNSPEC_TRN1_CONV ; Used in aarch64-sve.md. > > UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md. > > UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md. > > UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md. > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c > > b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c > > new file mode 100644 > > index > > 0000000000000000000000000000000000000000..525933863f7d67d76ba7afa4321346efa27ba000 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c > > @@ -0,0 +1,25 @@ > > +/* { dg-additional-options "-O2 -fno-schedule-insns" } */ > > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ > > + > > +#include "arm_sve.h" > > + > > +/* > > +** foo: > > +** ptrue (p[0-7])\.d, all > > +** pfalse (p[0-7])\.b > > +** ptrue (p[0-7])\.s, all > > +** trn1 (p[0-7])\.d, \2\.d, \3\.d > > +** trn1 \2\.d, \1\.d, \3\.d > > +** faddv (h[0-31]), \4\, (z[0-31]).h > > +** faddv (h[0-31]), \2\, \6\.h > > +** str \5, [x0] > > +** str \7, [x0, 2] > > +** ret > > +*/ > > +void foo(svfloat16_t in, float16_t *dst) { > > + const svbool_t pg_q0 = svdupq_n_b16(1, 0, 1, 0, 0, 0, 0, 0); > > + const svbool_t pg_f0 = svdupq_n_b16(1, 0, 0, 0, 0, 0, 0, 0); > > + dst[0] = svaddv_f16(pg_f0, in); > > + dst[1] = svaddv_f16(pg_q0, in); > > +} > > + --
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 7db2938bb84e04d066a7b07574e5cf344a3a8fb6..b8b6f55e1607e5697620bf205fbe9edf3be7c549 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -8657,6 +8657,20 @@ (define_insn "@aarch64_sve_<perm_insn><mode>" "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>" ) +;; Special purpose permute used by the predicate generation instructions. +;; Unlike the normal permute patterns, these instructions operate on VNx16BI +;; regardless of the element size, so that all input and output bits are +;; well-defined. Operand 3 then indicates the size of the permute. +(define_insn "@aarch64_sve_trn1_conv<mode>" + [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (unspec:VNx16BI [(match_operand:VNx16BI 1 "register_operand" "Upa") + (match_operand:VNx16BI 2 "register_operand" "Upa") + (match_operand:PRED_ALL 3 "aarch64_simd_imm_zero")] + UNSPEC_TRN1_CONV))] + "TARGET_SVE" + "trn1\t%0.<PRED_ALL:Vetype>, %1.<PRED_ALL:Vetype>, %2.<PRED_ALL:Vetype>" +) + ;; ========================================================================= ;; == Conversions ;; ========================================================================= diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 994fafc2dc857ca5c7f345e49b47cc7e7dcf5900..2c113322ff0874ee8762e0a642368adaba8c3793 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -5481,12 +5481,12 @@ aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder, } } - /* Emit the TRN1 itself. */ + /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI + operands but permutes them as though they had mode MODE. */ machine_mode mode = aarch64_sve_pred_mode (permute_size).require (); - target = aarch64_target_reg (target, mode); - emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target, - gen_lowpart (mode, a), - gen_lowpart (mode, b))); + target = aarch64_target_reg (target, GET_MODE (a)); + rtx type_reg = CONST0_RTX (mode); + emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg)); return target; } diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 5f5abd60525ba52fdb466e94a92ff4d011bee5cd..cac33ae812b382cd55611b0da8a6e9eac3a513c4 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -649,6 +649,7 @@ (define_c_enum "unspec" UNSPEC_UZP2Q ; Used in aarch64-sve.md. UNSPEC_ZIP1Q ; Used in aarch64-sve.md. UNSPEC_ZIP2Q ; Used in aarch64-sve.md. + UNSPEC_TRN1_CONV ; Used in aarch64-sve.md. UNSPEC_COND_CMPEQ_WIDE ; Used in aarch64-sve.md. UNSPEC_COND_CMPGE_WIDE ; Used in aarch64-sve.md. UNSPEC_COND_CMPGT_WIDE ; Used in aarch64-sve.md. diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c new file mode 100644 index 0000000000000000000000000000000000000000..525933863f7d67d76ba7afa4321346efa27ba000 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr100048.c @@ -0,0 +1,25 @@ +/* { dg-additional-options "-O2 -fno-schedule-insns" } */ +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ + +#include "arm_sve.h" + +/* +** foo: +** ptrue (p[0-7])\.d, all +** pfalse (p[0-7])\.b +** ptrue (p[0-7])\.s, all +** trn1 (p[0-7])\.d, \2\.d, \3\.d +** trn1 \2\.d, \1\.d, \3\.d +** faddv (h[0-31]), \4\, (z[0-31]).h +** faddv (h[0-31]), \2\, \6\.h +** str \5, [x0] +** str \7, [x0, 2] +** ret +*/ +void foo(svfloat16_t in, float16_t *dst) { + const svbool_t pg_q0 = svdupq_n_b16(1, 0, 1, 0, 0, 0, 0, 0); + const svbool_t pg_f0 = svdupq_n_b16(1, 0, 0, 0, 0, 0, 0, 0); + dst[0] = svaddv_f16(pg_f0, in); + dst[1] = svaddv_f16(pg_q0, in); +} +