This patch addresses a performance issue when compiling the MMA-optimized DGEMM kernel in OpenBLAS. The MMA code uses all 8 accumulators, which together overlap all of the vs0-vs31 vector registers. Current trunk assigns one of the normal vector input operands of an MMA instruction to one of the vs0-vs31 registers, which forces us to spill one of the accumulators to memory, leading to poor performance. The solution here is to replace the "wa" constraints for the vector input operands in the MMA instruction patterns with "v,?d", so that we disparage using vs0-vs31 and prefer using the Altivec registers vs32-vs63 instead, which fixes the DGEMM performance issue.
This passed bootstrap and regtesting with no regressions on powerpc64le-linux. Ok for trunk and after a few days of burn-in to the GCC12 release branch? Technically, the same issue exists in GCC11 and GCC10, but the RA assignment is OK with the current code, so unless/until we have a test case that exhibits the issue, I'm only asking for a backport to GCC12 which does show the performance problem. Peter gcc/ PR target/105556 * config/rs6000/mma.md (mma_<vv>, mma_<avv>, mma_<pv>, mma_<apv>, mma_<vvi4i4i8>, mma_<avvi4i4i8>, mma_<vvi4i4i2>, mma_<avvi4i4i2>, mma_<vvi4i4>, mma_<avvi4i4>, mma_<pvi4i2>, mma_<apvi4i2>, mma_<vvi4i4i4>, mma_<avvi4i4i4>): Replace "wa" constraint with "v,?d". diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index 907c9d6d516..9c9920870e4 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -490,50 +490,50 @@ (define_insn "mma_xxsetaccz" [(set_attr "type" "mma")]) (define_insn "mma_<vv>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 2 "vsx_register_operand" "v,?d,v,?d")] MMA_VV))] "TARGET_MMA" "<vv> %A0,%x1,%x2" [(set_attr "type" "mma")]) (define_insn "mma_<avv>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0,0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 3 "vsx_register_operand" "v,?d,v,?d")] MMA_AVV))] "TARGET_MMA" "<avv> %A0,%x2,%x3" [(set_attr "type" "mma")]) (define_insn "mma_<pv>" - [(set 
(match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 2 "vsx_register_operand" "v,?d,v,?d")] MMA_PV))] "TARGET_MMA" "<pv> %A0,%x1,%x2" [(set_attr "type" "mma")]) (define_insn "mma_<apv>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:OO 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0,0,0") + (match_operand:OO 2 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 3 "vsx_register_operand" "v,?d,v,?d")] MMA_APV))] "TARGET_MMA" "<apv> %A0,%x2,%x3" [(set_attr "type" "mma")]) (define_insn "mma_<vvi4i4i8>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "u8bit_cint_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 2 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "u8bit_cint_operand" "n,n,n,n")] MMA_VVI4I4I8))] "TARGET_MMA" "<vvi4i4i8> %A0,%x1,%x2,%3,%4,%5" @@ -541,13 +541,13 @@ (define_insn "mma_<vvi4i4i8>" (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4i8>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - 
(match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n") - (match_operand:SI 6 "u8bit_cint_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0,0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 3 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 6 "u8bit_cint_operand" "n,n,n,n")] MMA_AVVI4I4I8))] "TARGET_MMA" "<avvi4i4i8> %A0,%x2,%x3,%4,%5,%6" @@ -555,12 +555,12 @@ (define_insn "mma_<avvi4i4i8>" (set_attr "prefixed" "yes")]) (define_insn "mma_<vvi4i4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 2 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "const_0_to_3_operand" "n,n,n,n")] MMA_VVI4I4I2))] "TARGET_MMA" "<vvi4i4i2> %A0,%x1,%x2,%3,%4,%5" @@ -568,13 +568,13 @@ (define_insn "mma_<vvi4i4i2>" (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - 
(match_operand:SI 5 "const_0_to_15_operand" "n") - (match_operand:SI 6 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0,0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 3 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 6 "const_0_to_3_operand" "n,n,n,n")] MMA_AVVI4I4I2))] "TARGET_MMA" "<avvi4i4i2> %A0,%x2,%x3,%4,%5,%6" @@ -582,11 +582,11 @@ (define_insn "mma_<avvi4i4i2>" (set_attr "prefixed" "yes")]) (define_insn "mma_<vvi4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 2 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n")] MMA_VVI4I4))] "TARGET_MMA" "<vvi4i4> %A0,%x1,%x2,%3,%4" @@ -594,12 +594,12 @@ (define_insn "mma_<vvi4i4>" (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0,0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 3 
"vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n,n,n")] MMA_AVVI4I4))] "TARGET_MMA" "<avvi4i4> %A0,%x2,%x3,%4,%5" @@ -607,11 +607,11 @@ (define_insn "mma_<avvi4i4>" (set_attr "prefixed" "yes")]) (define_insn "mma_<pvi4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 2 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 4 "const_0_to_3_operand" "n,n,n,n")] MMA_PVI4I2))] "TARGET_MMA" "<pvi4i2> %A0,%x1,%x2,%3,%4" @@ -619,12 +619,12 @@ (define_insn "mma_<pvi4i2>" (set_attr "prefixed" "yes")]) (define_insn "mma_<apvi4i2>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:OO 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_3_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0,0,0") + (match_operand:OO 2 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 3 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "const_0_to_3_operand" "n,n,n,n")] MMA_APVI4I2))] "TARGET_MMA" "<apvi4i2> %A0,%x2,%x3,%4,%5" @@ -632,12 +632,12 @@ (define_insn "mma_<apvi4i2>" (set_attr "prefixed" "yes")]) (define_insn "mma_<vvi4i4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:V16QI 1 
"vsx_register_operand" "wa") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:SI 3 "const_0_to_15_operand" "n") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:V16QI 1 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 2 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n,n,n")] MMA_VVI4I4I4))] "TARGET_MMA" "<vvi4i4i4> %A0,%x1,%x2,%3,%4,%5" @@ -645,13 +645,13 @@ (define_insn "mma_<vvi4i4i4>" (set_attr "prefixed" "yes")]) (define_insn "mma_<avvi4i4i4>" - [(set (match_operand:XO 0 "fpr_reg_operand" "=&d") - (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0") - (match_operand:V16QI 2 "vsx_register_operand" "wa") - (match_operand:V16QI 3 "vsx_register_operand" "wa") - (match_operand:SI 4 "const_0_to_15_operand" "n") - (match_operand:SI 5 "const_0_to_15_operand" "n") - (match_operand:SI 6 "const_0_to_15_operand" "n")] + [(set (match_operand:XO 0 "fpr_reg_operand" "=&d,&d,&d,&d") + (unspec:XO [(match_operand:XO 1 "fpr_reg_operand" "0,0,0,0") + (match_operand:V16QI 2 "vsx_register_operand" "v,v,?d,?d") + (match_operand:V16QI 3 "vsx_register_operand" "v,?d,v,?d") + (match_operand:SI 4 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 5 "const_0_to_15_operand" "n,n,n,n") + (match_operand:SI 6 "const_0_to_15_operand" "n,n,n,n")] MMA_AVVI4I4I4))] "TARGET_MMA" "<avvi4i4i4> %A0,%x2,%x3,%4,%5,%6"