Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> This adds an implementation for conditional branch optab for AArch64.
>
> For e.g.
>
> void f1 ()
> {
>   for (int i = 0; i < N; i++)
>     {
>       b[i] += a[i];
>       if (a[i] > 0)
>         break;
>     }
> }
>
> For 128-bit vectors we generate:
>
>         cmgt    v1.4s, v1.4s, #0
>         umaxp   v1.4s, v1.4s, v1.4s
>         fmov    x3, d1
>         cbnz    x3, .L8
>
> and of 64-bit vector we can omit the compression:
>
>         cmgt    v1.2s, v1.2s, #0
>         fmov    x2, d1
>         cbz     x2, .L13
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md (cbranch<mode>4): New.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vect-early-break-cbranch.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 90118c6348e9614bef580d1dc94c0c1841dd5204..cd5ec35c3f53028f14828bd70a92924f62524c15 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3830,6 +3830,46 @@ (define_expand "vcond_mask_<mode><v_int_equiv>"
>      DONE;
>  })
>
> +;; Patterns comparing two vectors and conditionally jump
> +
> +(define_expand "cbranch<mode>4"
> +  [(set (pc)
> +        (if_then_else
> +          (match_operator 0 "aarch64_equality_operator"
> +            [(match_operand:VDQ_I 1 "register_operand")
> +             (match_operand:VDQ_I 2 "aarch64_simd_reg_or_zero")])
> +          (label_ref (match_operand 3 ""))
> +          (pc)))]
> +  "TARGET_SIMD"
> +{
> +  auto code = GET_CODE (operands[0]);
> +  rtx tmp = operands[1];
> +
> +  /* If comparing against a non-zero vector we have to do a comparison first
> +     so we can have a != 0 comparison with the result.  */
> +  if (operands[2] != CONST0_RTX (<MODE>mode))
> +    emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
> +                                        operands[2]));
> +
> +  /* For 64-bit vectors we need no reductions.  */
> +  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> +    {
> +      /* Always reduce using a V4SI.  */
> +      rtx reduc = gen_lowpart (V4SImode, tmp);
> +      rtx res = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
> +      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
> +    }
> +
> +  rtx val = gen_reg_rtx (DImode);
> +  emit_move_insn (val, gen_lowpart (DImode, tmp));
> +
> +  rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
> +  rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
> +  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
> +  DONE;
Are you sure this is correct for the operands[2] != const0_rtx case?
It looks like it uses the same comparison code for the vector comparison
and the scalar comparison.

E.g. if the pattern is passed a comparison:

  (eq (reg:V2SI x) (reg:V2SI y))

it looks like we'd generate a CMEQ for the x and y, then branch when the
DImode bitcast of the CMEQ result equals zero.  This means that we
branch when no elements of x and y are equal, rather than when all
elements of x and y are equal.

E.g. for:

  { 1, 2 } == { 1, 2 }

CMEQ will produce { -1, -1 }, the scalar comparison will be -1 == 0,
and the branch won't be taken.

ISTM it would be easier for the operands[2] != const0_rtx case to use
EOR instead of a comparison.  That gives a zero result if the input
vectors are equal and a nonzero result if the input vectors are
different.  We can then branch on the result using CODE and const0_rtx.

(Hope I've got that right.)

Maybe that also removes the need for patch 18.

Thanks,
Richard

> +})
> +
>  ;; Patterns comparing two vectors to produce a mask.
>
>  (define_expand "vec_cmp<mode><mode>"
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-cbranch.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c0363c3787270507d7902bb2ac0e39faef63a852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-cbranch.c
> @@ -0,0 +1,124 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#define N 640
> +int a[N] = {0};
> +int b[N] = {0};
> +
> +
> +/*
> +** f1:
> +**      ...
> +**      cmgt    v[0-9]+.4s, v[0-9]+.4s, #0
> +**      umaxp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**      fmov    x[0-9]+, d[0-9]+
> +**      cbnz    x[0-9]+, \.L[0-9]+
> +**      ...
> +*/
> +void f1 ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      b[i] += a[i];
> +      if (a[i] > 0)
> +        break;
> +    }
> +}
> +
> +/*
> +** f2:
> +**      ...
> +**      cmge    v[0-9]+.4s, v[0-9]+.4s, #0
> +**      umaxp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**      fmov    x[0-9]+, d[0-9]+
> +**      cbnz    x[0-9]+, \.L[0-9]+
> +**      ...
> +*/
> +void f2 ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      b[i] += a[i];
> +      if (a[i] >= 0)
> +        break;
> +    }
> +}
> +
> +/*
> +** f3:
> +**      ...
> +**      cmeq    v[0-9]+.4s, v[0-9]+.4s, #0
> +**      umaxp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**      fmov    x[0-9]+, d[0-9]+
> +**      cbnz    x[0-9]+, \.L[0-9]+
> +**      ...
> +*/
> +void f3 ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      b[i] += a[i];
> +      if (a[i] == 0)
> +        break;
> +    }
> +}
> +
> +/*
> +** f4:
> +**      ...
> +**      cmtst   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**      umaxp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**      fmov    x[0-9]+, d[0-9]+
> +**      cbnz    x[0-9]+, \.L[0-9]+
> +**      ...
> +*/
> +void f4 ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      b[i] += a[i];
> +      if (a[i] != 0)
> +        break;
> +    }
> +}
> +
> +/*
> +** f5:
> +**      ...
> +**      cmlt    v[0-9]+.4s, v[0-9]+.4s, #0
> +**      umaxp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**      fmov    x[0-9]+, d[0-9]+
> +**      cbnz    x[0-9]+, \.L[0-9]+
> +**      ...
> +*/
> +void f5 ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      b[i] += a[i];
> +      if (a[i] < 0)
> +        break;
> +    }
> +}
> +
> +/*
> +** f6:
> +**      ...
> +**      cmle    v[0-9]+.4s, v[0-9]+.4s, #0
> +**      umaxp   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**      fmov    x[0-9]+, d[0-9]+
> +**      cbnz    x[0-9]+, \.L[0-9]+
> +**      ...
> +*/
> +void f6 ()
> +{
> +  for (int i = 0; i < N; i++)
> +    {
> +      b[i] += a[i];
> +      if (a[i] <= 0)
> +        break;
> +    }
> +}
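
For reference, the EOR-based approach Richard describes above could take
roughly this shape in the cbranch<mode>4 expander body.  This is only a
sketch, not part of the posted patch: it assumes gen_xor<mode>3 can be
emitted directly for VDQ_I modes here, and it is untested.

  rtx tmp = operands[1];

  /* If comparing against a non-zero vector, EOR the two inputs instead of
     comparing them: the EOR result is all-zeros iff the vectors are equal,
     so the original comparison code can still be applied against zero.  */
  if (operands[2] != CONST0_RTX (<MODE>mode))
    {
      tmp = gen_reg_rtx (<MODE>mode);
      emit_insn (gen_xor<mode>3 (tmp, operands[1], operands[2]));
    }

  /* ... then reduce TMP and branch on (CODE val 0) exactly as in the
     quoted expander above.  */

With this shape, both the EQ and NE cases branch correctly: for EQ the
branch is taken only when every element of the EOR result is zero, and
for NE when any element is nonzero.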