Optimize multiply/add of DImode extended to TImode, PR target/103109. On power9 and power10 systems, there are multiply-add instructions (maddhd, maddhdu, and maddld) that take 64-bit integers extended to 128-bit integers and produce 128-bit results. This patch adds support to generate these instructions.
Previously GCC had define_expands to handle extending the 64-bit operands to 128 bits and multiplying them. This patch changes these define_expands to define_insn_and_split and then it provides combiner patterns to generate these multiply/add instructions. To support using this optimization on power9, this patch extends the DImode to TImode sign extension to also run on power9 (added for PR target/104698). This patch needs the previous patch to add unsigned DImode to TImode conversion so that the combiner can combine the extend, multiply, and add instructions. I have built this patch on little endian power10, little endian power9, and big endian power8 systems. There were no regressions when I ran it. Can I install this patch into the GCC 13 master branch? 2022-05-13 Michael Meissner <meiss...@linux.ibm.com> gcc/ PR target/103109 * config/rs6000/rs6000.md (su_int32): New code attribute. (<u>mul<mode><dmode>3): Convert from define_expand to define_insn_and_split. (maddld<mode>4): Add generator function. (<u>mulditi3_<u>adddi3): New insn. (<u>mulditi3_add_const): New insn. (<u>mulditi3_<u>adddi3_upper): New insn. gcc/testsuite/ PR target/103109 * gcc.target/powerpc/pr103109.c: New test. --- gcc/config/rs6000/rs6000.md | 128 +++++++++++++++++++- gcc/testsuite/gcc.target/powerpc/pr103109.c | 62 ++++++++++ 2 files changed, 184 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr103109.c diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 2aba70393d8..83eacec57ba 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -667,6 +667,9 @@ (define_code_attr uns [(fix "") (float "") (unsigned_float "uns")]) +(define_code_attr su_int32 [(sign_extend "s32bit_cint_operand") + (zero_extend "c32bit_cint_operand")]) + ; Various instructions that come in SI and DI forms. ; A generic w/d attribute, for things like cmpw/cmpd. 
(define_mode_attr wd [(QI "b") @@ -3190,13 +3193,16 @@ (define_insn "<su>mulsi3_highpart_64" "mulhw<u> %0,%1,%2" [(set_attr "type" "mul")]) -(define_expand "<u>mul<mode><dmode>3" - [(set (match_operand:<DMODE> 0 "gpc_reg_operand") +(define_insn_and_split "<u>mul<mode><dmode>3" + [(set (match_operand:<DMODE> 0 "gpc_reg_operand" "=&r") (mult:<DMODE> (any_extend:<DMODE> - (match_operand:GPR 1 "gpc_reg_operand")) + (match_operand:GPR 1 "gpc_reg_operand" "r")) (any_extend:<DMODE> - (match_operand:GPR 2 "gpc_reg_operand"))))] + (match_operand:GPR 2 "gpc_reg_operand" "r"))))] "!(<MODE>mode == SImode && TARGET_POWERPC64)" + "#" + "&& 1" + [(pc)] { rtx l = gen_reg_rtx (<MODE>mode); rtx h = gen_reg_rtx (<MODE>mode); @@ -3205,9 +3211,10 @@ (define_expand "<u>mul<mode><dmode>3" emit_move_insn (gen_lowpart (<MODE>mode, operands[0]), l); emit_move_insn (gen_highpart (<MODE>mode, operands[0]), h); DONE; -}) +} + [(set_attr "length" "8")]) -(define_insn "*maddld<mode>4" +(define_insn "maddld<mode>4" [(set (match_operand:GPR 0 "gpc_reg_operand" "=r") (plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r") (match_operand:GPR 2 "gpc_reg_operand" "r")) @@ -3216,6 +3223,115 @@ (define_insn "*maddld<mode>4" "maddld %0,%1,%2,%3" [(set_attr "type" "mul")]) +(define_insn_and_split "*<u>mulditi3_<u>adddi3" + [(set (match_operand:TI 0 "gpc_reg_operand" "=&r") + (plus:TI + (mult:TI + (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) + (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) + (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r"))))] + "TARGET_MADDLD && TARGET_POWERPC64" + "#" + "&& 1" + [(pc)] +{ + rtx dest = operands[0]; + rtx dest_hi = gen_highpart (DImode, dest); + rtx dest_lo = gen_lowpart (DImode, dest); + rtx op1 = operands[1]; + rtx op2 = operands[2]; + rtx op3 = operands[3]; + rtx tmp_hi, tmp_lo; + + if (can_create_pseudo_p ()) + { + tmp_hi = gen_reg_rtx (DImode); + tmp_lo = gen_reg_rtx (DImode); + } + else + { + tmp_hi = dest_hi; + tmp_lo = 
dest_lo; + } + + emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3)); + emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3)); + + if (can_create_pseudo_p ()) + { + emit_move_insn (dest_hi, tmp_hi); + emit_move_insn (dest_lo, tmp_lo); + } + DONE; +} + [(set_attr "length" "8")]) + +;; Optimize 128-bit multiply with zero/sign extend and adding a constant. We +;; force the constant into a register to generate li, maddhd, and maddld, +;; instead of mulld, mulhd, addic, and addze. We can't combine this pattern +;; with the pattern that handles registers, since constants don't have a sign +;; or zero extend around them. +(define_insn_and_split "*<u>mulditi3_add_const" + [(set (match_operand:TI 0 "gpc_reg_operand" "=&r") + (plus:TI + (mult:TI + (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) + (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) + (match_operand 3 "<su_int32>" "r")))] + "TARGET_MADDLD && TARGET_POWERPC64 +" + "#" + "&& 1" + [(pc)] +{ + rtx dest = operands[0]; + rtx dest_hi = gen_highpart (DImode, dest); + rtx dest_lo = gen_lowpart (DImode, dest); + rtx op1 = operands[1]; + rtx op2 = operands[2]; + rtx op3 = force_reg (DImode, operands[3]); + rtx tmp_hi, tmp_lo; + + if (can_create_pseudo_p ()) + { + tmp_hi = gen_reg_rtx (DImode); + tmp_lo = gen_reg_rtx (DImode); + } + else + { + tmp_hi = dest_hi; + tmp_lo = dest_lo; + } + + emit_insn (gen_<u>mulditi3_<u>adddi3_upper (tmp_hi, op1, op2, op3)); + emit_insn (gen_maddlddi4 (tmp_lo, op1, op2, op3)); + + if (can_create_pseudo_p ()) + { + emit_move_insn (dest_hi, tmp_hi); + emit_move_insn (dest_lo, tmp_lo); + } + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "mul") + (set_attr "size" "64")]) + +(define_insn "<u>mulditi3_<u>adddi3_upper" + [(set (match_operand:DI 0 "gpc_reg_operand" "=r") + (truncate:DI + (lshiftrt:TI + (plus:TI + (mult:TI + (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r")) + (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r"))) + (any_extend:TI 
(match_operand:DI 3 "gpc_reg_operand" "r"))) + (const_int 64))))] + "TARGET_MADDLD && TARGET_POWERPC64" + "maddhd<u> %0,%1,%2,%3" + [(set_attr "type" "mul") + (set_attr "size" "64")]) + (define_insn "udiv<mode>3" [(set (match_operand:GPR 0 "gpc_reg_operand" "=r") (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r") diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c b/gcc/testsuite/gcc.target/powerpc/pr103109.c new file mode 100644 index 00000000000..ae2cfb9eda7 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c @@ -0,0 +1,62 @@ +/* { dg-require-effective-target int128 } */ +/* { dg-require-effective-target power10_ok } */ +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ + +/* This test makes sure that GCC generates the maddhd, maddhdu, and maddld + power9 instructions when doing some forms of 64-bit integers converted to + 128-bit integers and used with multiply/add operations. */ + +__int128_t +s_mult_add (long long a, + long long b, + long long c) +{ + /* maddhd, maddld. */ + return ((__int128_t)a * (__int128_t)b) + (__int128_t)c; +} + +/* Test 32-bit constants that are loaded into GPRs instead of doing the + mulld/mulhd and then addic/addime or addc/addze. */ +__int128_t +s_mult_add_m10 (long long a, + long long b) +{ + /* maddhd, maddld. */ + return ((__int128_t)a * (__int128_t)b) - 10; +} + +__int128_t +s_mult_add_70000 (long long a, + long long b) +{ + /* maddhd, maddld. */ + return ((__int128_t)a * (__int128_t)b) + 70000; +} + +__uint128_t +u_mult_add (unsigned long long a, + unsigned long long b, + unsigned long long c) +{ + /* maddhd, maddld. */ + return ((__uint128_t)a * (__uint128_t)b) + (__uint128_t)c; +} + +__uint128_t +u_mult_add_0x80000000 (unsigned long long a, + unsigned long long b) +{ + /* maddhd, maddld. 
*/ + return ((__uint128_t)a * (__uint128_t)b) + 0x80000000UL; +} + +/* { dg-final { scan-assembler-not {\maddc\M} } } */ +/* { dg-final { scan-assembler-not {\madde\M} } } */ +/* { dg-final { scan-assembler-not {\maddid\M} } } */ +/* { dg-final { scan-assembler-not {\maddme\M} } } */ +/* { dg-final { scan-assembler-not {\maddze\M} } } */ +/* { dg-final { scan-assembler-not {\mmulhd\M} } } */ +/* { dg-final { scan-assembler-not {\mmulld\M} } } */ +/* { dg-final { scan-assembler-times {\mmaddhd\M} 3 } } */ +/* { dg-final { scan-assembler-times {\mmaddhdu\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mmaddld\M} 5 } } */ -- 2.35.3 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: meiss...@linux.ibm.com