Data-share write (ds_write) instructions do not necessarily complete the write to LDS immediately. When a write completes, LGKM_CNT is decremented. For now, we wait until LGKM_CNT reaches zero after each ds_write instruction.
This fixes a race condition in the case where LDS is read immediately after being written. This can happen with broadcast operations. This may be latent on mainline, since the broadcast machinery isn't present there yet. Nonetheless, I will apply shortly. Julian 2020-09-08 Julian Brown <jul...@codesourcery.com> gcc/ * config/gcn/gcn-valu.md (scatter<mode>_insn_1offset_ds<exec_scatter>): Add waitcnt. * config/gcn/gcn.md (*mov<mode>_insn, *movti_insn): Add waitcnt to ds_write alternatives. --- gcc/config/gcn/gcn-valu.md | 2 +- gcc/config/gcn/gcn.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 26559ff765e..e4d7f2a0f49 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -923,7 +923,7 @@ { addr_space_t as = INTVAL (operands[3]); static char buf[200]; - sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s", + sprintf (buf, "ds_write%%b2\t%%0, %%2 offset:%%1%s\;s_waitcnt\tlgkmcnt(0)", (AS_GDS_P (as) ? " gds" : "")); return buf; } diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index ed98d2d2706..aeb25fbb931 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -554,7 +554,7 @@ flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0 flat_store_dword\t%A0, %1%O0%g0 v_mov_b32\t%0, %1 - ds_write_b32\t%A0, %1%O0 + ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) s_mov_b32\t%0, %1 global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) @@ -582,7 +582,7 @@ flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0 flat_store%s0\t%A0, %1%O0%g0 v_mov_b32\t%0, %1 - ds_write%b0\t%A0, %1%O0 + ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) global_store%s0\t%A0, %1%O0%g0" @@ -611,7 +611,7 @@ # flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0 flat_store_dwordx2\t%A0, %1%O0%g0 - ds_write_b64\t%A0, %1%O0 + ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0) global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) global_store_dwordx2\t%A0, %1%O0%g0" @@ -667,7 +667,7 @@ # global_store_dwordx4\t%A0, %1%O0%g0 global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0) - ds_write_b128\t%A0, %1%O0 + ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0) ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)" "reload_completed && REG_P (operands[0]) -- 2.28.0