Hi All, The usdot operation is common in video encoders and decoders, including some of the most widely used ones.
This patch adds a +dotprod version of the optab as a fallback for when sdot is available but usdot is not. The fallback works by adding a bias to the unsigned argument to convert it to a signed value and then correcting for the bias later on. Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is signed (assuming both are 8-bit values). Because the range of a signed byte only extends to 127, we split the bias correction into: (x - 128)y + 127y + y. Concretely, for: #define N 480 #define SIGNEDNESS_1 unsigned #define SIGNEDNESS_2 signed #define SIGNEDNESS_3 signed #define SIGNEDNESS_4 unsigned SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, SIGNEDNESS_4 char *restrict b) { for (__INTPTR_TYPE__ i = 0; i < N; ++i) { int av = a[i]; int bv = b[i]; SIGNEDNESS_2 short mult = av * bv; res += mult; } return res; } we generate: f: ... mov z6.b, #0 mov z5.b, #127 mov z4.b, #1 mov z3.b, #-128 ptrue p1.b, all movi v0.4s, 0 .L2: ld1b z2.b, p0/z, [x1, x3] ld1b z1.b, p0/z, [x2, x3] incb x3 sel z1.b, p0, z1.b, z6.b whilelo p0.b, w3, w4 sub z1.b, z1.b, z3.b sdot z0.s, z1.b, z2.b sdot z0.s, z5.b, z2.b sdot z0.s, z4.b, z2.b b.any .L2 instead of: f: ... .L2: ld1sb z2.h, p0/z, [x1, x3] punpklo p1.h, p0.b ld1b z0.h, p0/z, [x2, x3] add x3, x3, x5 mul z0.h, p2/m, z0.h, z2.h sunpklo z2.s, z0.h sunpkhi z0.s, z0.h add z1.s, p1/m, z1.s, z2.s punpkhi p1.h, p0.b whilelo p0.h, w3, w4 add z1.s, p1/m, z1.s, z0.s b.any .L2 The new sequence is significantly faster as the operations it uses are well optimized. Note that execution tests are already in the mid-end testsuite. Thanks to James Greenhalgh for the tip-off. Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-sve.md (@<sur>dot_prod<vsi2qi>): Generate fallback or call original insns ... (@<sur>dot_prod<vsi2qi>_insn): ...here. 
gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/vusdot-autovec_2.c: New test. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index bd60e65b0c3f05f1c931f03807170f3b9d699de5..ca60416e7d7b1d8848f4ec5a624ae479a12ae5bc 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -6887,7 +6887,7 @@ (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>" [(set_attr "movprfx" "*,yes")] ) -(define_insn "@<sur>dot_prod<vsi2qi>" +(define_insn "@<sur>dot_prod<vsi2qi>_insn" [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") (plus:VNx4SI_ONLY (unspec:VNx4SI_ONLY @@ -6902,6 +6902,43 @@ (define_insn "@<sur>dot_prod<vsi2qi>" [(set_attr "movprfx" "*,yes")] ) +(define_expand "@<sur>dot_prod<vsi2qi>" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand") + (plus:VNx4SI_ONLY + (unspec:VNx4SI_ONLY + [(match_operand:<VSI2QI> 1 "register_operand") + (match_operand:<VSI2QI> 2 "register_operand")] + DOTPROD_US_ONLY) + (match_operand:VNx4SI_ONLY 3 "register_operand")))] + "TARGET_SVE || TARGET_SVE_I8MM" +{ + if (TARGET_SVE_I8MM) + { + emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } + + machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode); + HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1); + rtx signbit = gen_int_mode (val, elemmode); + rtx t1 = gen_reg_rtx (<MODE>mode); + rtx t2 = gen_reg_rtx (<MODE>mode); + rtx tmp = gen_reg_rtx (<VSI2QI>mode); + rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode, + gen_int_mode (val - 1, elemmode)); + rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode)); + rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit); + c1 = force_reg (<VSI2QI>mode, c1); + c2 = force_reg (<VSI2QI>mode, c2); + dup = force_reg (<VSI2QI>mode, dup); + emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup)); + emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], 
operands[3])); + emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1)); + emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2)); + DONE; +}) + (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>" [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") (plus:VNx4SI_ONLY diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c new file mode 100644 index 0000000000000000000000000000000000000000..cbe6b7eb7bef5a5c4b8e5ac823ebdf1d309f8490 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#pragma GCC target "+noi8mm" + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { dg-final { scan-assembler-not {\tusdot\t} } } */ +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */ --
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index bd60e65b0c3f05f1c931f03807170f3b9d699de5..ca60416e7d7b1d8848f4ec5a624ae479a12ae5bc 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -6887,7 +6887,7 @@ (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>" [(set_attr "movprfx" "*,yes")] ) -(define_insn "@<sur>dot_prod<vsi2qi>" +(define_insn "@<sur>dot_prod<vsi2qi>_insn" [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") (plus:VNx4SI_ONLY (unspec:VNx4SI_ONLY @@ -6902,6 +6902,43 @@ (define_insn "@<sur>dot_prod<vsi2qi>" [(set_attr "movprfx" "*,yes")] ) +(define_expand "@<sur>dot_prod<vsi2qi>" + [(set (match_operand:VNx4SI_ONLY 0 "register_operand") + (plus:VNx4SI_ONLY + (unspec:VNx4SI_ONLY + [(match_operand:<VSI2QI> 1 "register_operand") + (match_operand:<VSI2QI> 2 "register_operand")] + DOTPROD_US_ONLY) + (match_operand:VNx4SI_ONLY 3 "register_operand")))] + "TARGET_SVE || TARGET_SVE_I8MM" +{ + if (TARGET_SVE_I8MM) + { + emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } + + machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode); + HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1); + rtx signbit = gen_int_mode (val, elemmode); + rtx t1 = gen_reg_rtx (<MODE>mode); + rtx t2 = gen_reg_rtx (<MODE>mode); + rtx tmp = gen_reg_rtx (<VSI2QI>mode); + rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode, + gen_int_mode (val - 1, elemmode)); + rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode)); + rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit); + c1 = force_reg (<VSI2QI>mode, c1); + c2 = force_reg (<VSI2QI>mode, c2); + dup = force_reg (<VSI2QI>mode, dup); + emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup)); + emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3])); + emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1)); + emit_insn (gen_sdot_prod<vsi2qi> 
(operands[0], c2, operands[2], t2)); + DONE; +}) + (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>" [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w") (plus:VNx4SI_ONLY diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c new file mode 100644 index 0000000000000000000000000000000000000000..cbe6b7eb7bef5a5c4b8e5ac823ebdf1d309f8490 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#pragma GCC target "+noi8mm" + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { dg-final { scan-assembler-not {\tusdot\t} } } */ +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */