There are still issues with MI300, some which get resolved by adding s_nop.

One case where it is exactly known where the s_nop fixes a fail is for
libgomp.c-c++-common/task-detach-10.c, where libgomp/single.c's
GOMP_single_start() never returns 1, such that 'omp single' is
never executed. Adding an s_nop fixes it; namely it has to be added at

  v_cmp_eq_u64   vcc, v[4:5], v[8:9]    ;, tmp711, _4
  s_nop  0x0  ; ← now added
  v_mov_b32      v0, vcc_lo     ; tmp744, tmp714

The other case is taken from the manual and I have no idea whether it
actually has an effect (both in how often it gets inserted and whether
it fixes any testcase); still, it makes sense to have it.

Looking at the number of fails for check-target-libgomp on an MI300A
system (x86-64 with CDNA3 GPU), the fails are now down to:

# of expected passes            31299
# of unexpected failures        72
# of unexpected successes       1
# of expected failures          706
# of unresolved testcases       5
# of unsupported tests          867

i.e. 0.2% fail. Or - looking only at the 'execution test' lines, 55 fail
with a total of (8104+55) PASS+FAIL tests; that's 0.7%. Some fails are
known, e.g. 9 link fails because I don't have a gfx{900,…} multilib; some
others are also known fails. Still, most execution-test fails shouldn't
be there ...

* * *

Next step: Identify those libgomp tests which start working when inserting
more s_nop (e.g. at least one 1 or 2). But there are also other known issues,
which are not fixed by even 5 s_nop - and require another solution.

Tobias

PS: One of such fails is for 'indirect' combined with 'target teams'.
It is not quite clear whether any other issue contributes to it, but
there is a known race. See https://gcc.gnu.org/PR114445 (comment 0 for
the known issue, comment 1 for the MI300 issue).
gcn: Add more s_nop for MI300

Implement another case where MI300 requires s_nop according to the spec,
add a comment why one case does not need to be handled. And add one case
where an s_nop is required but seems to be not mentioned in the spec.

gcc/ChangeLog:

	* config/gcn/gcn.cc (gcn_cmpx_insn_p): Rename to ...
	(gcn_v_cmpx_insn_p): ... this. Simplify.
	(gcn_v_cmp_insn_p): New.
	(gcn_md_reorg): Add two new conditions for MI300.

 gcc/config/gcn/gcn.cc | 87 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 56 insertions(+), 31 deletions(-)

diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 8959118b869..467814d84ae 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5796,36 +5796,18 @@ gcn_libc_has_function (enum function_class fn_class,
    note: this will also match 'v_cmp %E1 vcc'.  */
 
 static bool
-gcn_cmpx_insn_p (attr_type type)
+gcn_v_cmpx_insn_p (attr_type type)
 {
-  switch (type)
-    {
-    case TYPE_VOPC:
-      return true;
-    case TYPE_MUBUF:
-    case TYPE_MTBUF:
-    case TYPE_FLAT:
-    case TYPE_VOP3P_MAI:
-    case TYPE_UNKNOWN:
-    case TYPE_SOP1:
-    case TYPE_SOP2:
-    case TYPE_SOPK:
-    case TYPE_SOPC:
-    case TYPE_SOPP:
-    case TYPE_SMEM:
-    case TYPE_DS:
-    case TYPE_VOP2:
-    case TYPE_VOP1:
-    case TYPE_VOP3A:
-    case TYPE_VOP3B:
-    case TYPE_VOP_SDWA:
-    case TYPE_VOP_DPP:
-    case TYPE_MULT:
-    case TYPE_VMULT:
-      return false;
-    }
-  gcc_unreachable ();
-  return false;
+  return type == TYPE_VOPC;
+}
+
+/* Identify V_CMP from the "type" attribute;
+   note: this will also match all V_CMPX.  */
+
+static bool
+gcn_v_cmp_insn_p (attr_type type)
+{
+  return type == TYPE_VOPC || type == TYPE_VOP3A;
 }
 
 /* Identify VMEM instructions from their "type" attribute.  */
@@ -6356,19 +6338,62 @@ gcn_md_reorg (void)
 		   reg_class_contents[(int)VCC_CONDITIONAL_REG])))
 	    nops_rqd = ivccwait - prev_insn->age;
 
+	  /* NOTE: The following condition for adding wait state exists, but
+	     GCC does not access the special registers using their SGPR#.
+	     Thus, no action is required here.  The following wait-state
+	     condition exists at least for VEGA/gfx900+ to CDNA3:
+		Mixed use of VCC: alias vs. SGPR# - v_readlane,
+		v_readfirstlane, v_cmp, v_add_*i/u, v_sub_*i/u, v_div_*scale
+		followed by VALU reads VCC as constant requires 1 wait state.
+		(As carry-in, it requires none.)
+		[VCC can be accessed by name or logical SGPR that holds it.]  */
+
+	  /* Testing indicates that CDNA3 requires an s_nop between
+	     e.g. 'v_cmp_eq_u64 vcc, v[4:5], v[8:9]' and 'v_mov_b32 v0, vcc_lo'.
+	     Thus: add it between v_cmp writing VCC and VALU read of VCC.  */
+	  if (TARGET_CDNA3_NOPS
+	      && (prev_insn->age + nops_rqd) < 1
+	      && iunit == UNIT_VECTOR
+	      && (hard_reg_set_intersect_p
+		  (depregs, reg_class_contents[(int)VCC_CONDITIONAL_REG]))
+	      && gcn_v_cmp_insn_p (prev_insn->type))
+	    nops_rqd = 1 - prev_insn->age;
+
+	  /* CDNA3: VALU writes VGPR/VCC: v_readlane, v_readfirstlane, v_cmp,
+	     v_add_*i/u, v_sub_*i/u, v_div_*scale - followed by:
+	     - VALU reads SGPR as constant requires 1 waite state
+	     - VALU reads SGPR as carry-in requires no waite state
+	     - v_readlane/v_writelane reads SGPR as lane select requires 4 wait
+	       states.  */
+	  if (TARGET_CDNA3_NOPS
+	      && (prev_insn->age + nops_rqd) < 4
+	      && prev_insn->unit == UNIT_VECTOR
+	      && (get_attr_laneselect (prev_insn->insn) == LANESELECT_READ
+		  || gcn_v_cmp_insn_p (prev_insn->type)
+		  || prev_insn->type == TYPE_VOP2
+		  || prev_insn->type == TYPE_VOP3B)
+	      && hard_reg_set_intersect_p
+		   (depregs, reg_class_contents[(int) SGPR_REGS]))
+	    {
+	      if (get_attr_laneselect (insn) != LANESELECT_NO)
+		nops_rqd = 4 - prev_insn->age;
+	      else if ((prev_insn->age + nops_rqd) < 1)
+		nops_rqd = 1 - prev_insn->age;
+	    }
+
 	  /* CDNA3: v_cmpx followed by
 	     - V_readlane, v_readfirstlane, v_writelane requires 4 wait states
 	     - VALU reads EXEC as constant requires 2 wait states
 	     - other VALU requires no wait state  */
 	  if (TARGET_CDNA3_NOPS
 	      && (prev_insn->age + nops_rqd) < 4
-	      && gcn_cmpx_insn_p (prev_insn->type)
+	      && gcn_v_cmpx_insn_p (prev_insn->type)
 	      && get_attr_laneselect (insn) != LANESELECT_NO)
 	    nops_rqd = 4 - prev_insn->age;
 	  else if (TARGET_CDNA3_NOPS
 		   && (prev_insn->age + nops_rqd) < 2
 		   && iunit == UNIT_VECTOR
-		   && gcn_cmpx_insn_p (prev_insn->type)
+		   && gcn_v_cmpx_insn_p (prev_insn->type)
 		   && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
 	    nops_rqd = 2 - prev_insn->age;
 

Reply via email to