Hi All, The usdot operation is common in video encoders and decoders, including some of the most widely used ones.
This patch adds a +dotprod version of the optab as a fallback for when you do have sdot but not usdot available. The fallback works by adding a bias to the unsigned argument to convert it to a signed value and then correcting for the bias later on. Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is signed (assuming both are 8-bit values). Because the range of a signed byte only extends up to 127, the +128 correction factor cannot be represented as a single signed byte, so we split the bias correction into: (x - 128)y + 127y + y. Concretely, for: #define N 480 #define SIGNEDNESS_1 unsigned #define SIGNEDNESS_2 signed #define SIGNEDNESS_3 signed #define SIGNEDNESS_4 unsigned SIGNEDNESS_1 int __attribute__ ((noipa)) f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, SIGNEDNESS_4 char *restrict b) { for (__INTPTR_TYPE__ i = 0; i < N; ++i) { int av = a[i]; int bv = b[i]; SIGNEDNESS_2 short mult = av * bv; res += mult; } return res; } we generate: movi v5.16b, 0x7f mov x3, 0 movi v4.16b, 0x1 movi v3.16b, 0xffffffffffffff80 movi v0.4s, 0 .L2: ldr q2, [x2, x3] ldr q1, [x1, x3] add x3, x3, 16 sub v2.16b, v2.16b, v3.16b sdot v0.4s, v2.16b, v1.16b sdot v0.4s, v5.16b, v1.16b sdot v0.4s, v4.16b, v1.16b cmp x3, 480 bne .L2 instead of: movi v0.4s, 0 mov x3, 0 .L2: ldr q2, [x1, x3] ldr q1, [x2, x3] add x3, x3, 16 sxtl v4.8h, v2.8b sxtl2 v3.8h, v2.16b uxtl v2.8h, v1.8b uxtl2 v1.8h, v1.16b mul v2.8h, v2.8h, v4.8h mul v1.8h, v1.8h, v3.8h saddw v0.4s, v0.4s, v2.4h saddw2 v0.4s, v0.4s, v2.8h saddw v0.4s, v0.4s, v1.4h saddw2 v0.4s, v0.4s, v1.8h cmp x3, 480 bne .L2 The new sequence is significantly faster as the operations it uses are well optimized. Note that execution tests are already in the mid-end testsuite. Thanks to James Greenhalgh for the tip-off. Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-simd.md (usdot_prod<vsi2qi>): Generate fallback or call original insns ... (usdot_prod<vsi2qi>_insn): ...here. 
gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/vusdot-autovec-2.c: New test. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>" ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot ;; (vector) Dot Product operation and the vectorized optab. -(define_insn "usdot_prod<vsi2qi>" +(define_insn "usdot_prod<vsi2qi>_insn" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w") @@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>" [(set_attr "type" "neon_dot<q>")] ) +;; usdot auto-vec fallback code +(define_expand "usdot_prod<vsi2qi>" + [(set (match_operand:VS 0 "register_operand") + (plus:VS + (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand") + (match_operand:<VSI2QI> 2 "register_operand")] + UNSPEC_USDOT) + (match_operand:VS 3 "register_operand")))] + "TARGET_DOTPROD || TARGET_I8MM" +{ + if (TARGET_I8MM) + { + emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } + + machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode); + HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1); + rtx signbit = gen_int_mode (val, elemmode); + rtx t1 = gen_reg_rtx (<MODE>mode); + rtx t2 = gen_reg_rtx (<MODE>mode); + rtx tmp = gen_reg_rtx (<VSI2QI>mode); + rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode, + gen_int_mode (val - 1, elemmode)); + rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode)); + rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit); + c1 = force_reg (<VSI2QI>mode, c1); + c2 = force_reg (<VSI2QI>mode, c2); + dup = force_reg (<VSI2QI>mode, dup); + emit_insn (gen_sub<vsi2qi>3 (tmp, 
operands[1], dup)); + emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3])); + emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1)); + emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2)); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_<sur>dot_lane<vsi2qi>" diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c new file mode 100644 index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */ + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { dg-final { scan-assembler-not {\tusdot\t} } } */ +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */ --
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -623,7 +623,7 @@ (define_insn "<sur>dot_prod<vsi2qi>" ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot ;; (vector) Dot Product operation and the vectorized optab. -(define_insn "usdot_prod<vsi2qi>" +(define_insn "usdot_prod<vsi2qi>_insn" [(set (match_operand:VS 0 "register_operand" "=w") (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand" "w") @@ -635,6 +635,43 @@ (define_insn "usdot_prod<vsi2qi>" [(set_attr "type" "neon_dot<q>")] ) +;; usdot auto-vec fallback code +(define_expand "usdot_prod<vsi2qi>" + [(set (match_operand:VS 0 "register_operand") + (plus:VS + (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand") + (match_operand:<VSI2QI> 2 "register_operand")] + UNSPEC_USDOT) + (match_operand:VS 3 "register_operand")))] + "TARGET_DOTPROD || TARGET_I8MM" +{ + if (TARGET_I8MM) + { + emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1], + operands[2], operands[3])); + DONE; + } + + machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode); + HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1); + rtx signbit = gen_int_mode (val, elemmode); + rtx t1 = gen_reg_rtx (<MODE>mode); + rtx t2 = gen_reg_rtx (<MODE>mode); + rtx tmp = gen_reg_rtx (<VSI2QI>mode); + rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode, + gen_int_mode (val - 1, elemmode)); + rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode)); + rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit); + c1 = force_reg (<VSI2QI>mode, c1); + c2 = force_reg (<VSI2QI>mode, c2); + dup = force_reg (<VSI2QI>mode, dup); + emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup)); + emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3])); + emit_insn 
(gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1)); + emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2)); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product ;; indexed operations. (define_insn "aarch64_<sur>dot_lane<vsi2qi>" diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c new file mode 100644 index 0000000000000000000000000000000000000000..acd8e36209690386d021df72f1467a696750ac3e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=armv8.2-a+noi8mm+dotprod" } */ + +#define N 480 +#define SIGNEDNESS_1 unsigned +#define SIGNEDNESS_2 signed +#define SIGNEDNESS_3 signed +#define SIGNEDNESS_4 unsigned + +SIGNEDNESS_1 int __attribute__ ((noipa)) +f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a, + SIGNEDNESS_4 char *restrict b) +{ + for (__INTPTR_TYPE__ i = 0; i < N; ++i) + { + int av = a[i]; + int bv = b[i]; + SIGNEDNESS_2 short mult = av * bv; + res += mult; + } + return res; +} + +/* { dg-final { scan-assembler-not {\tusdot\t} } } */ +/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */