Andrew Stubbs wrote:
On 24/07/2025 16:49, Tobias Burnus wrote:
Andrew Stubbs wrote:
On 24/07/2025 14:25, Tobias Burnus wrote:
+/* Device requires CDNA1-style manually inserted wait states for
AVGPRs. */
+#define TARGET_AVGPR_CDNA3_NOPS TARGET_CDNA3
This is not for CDNA1, and not for AVGPRS.
I have deleted it and use now TARGET_CDNA3 directly.
The only thing wrong with what you had before was the name (and the
comment was an unmodified cut-and-paste).
OK, I misunderstood the comment.
+ /* RDNA 2, 3 and 3.5 require no manually inserted wait states. */
+ if (TARGET_RDNA2_PLUS)
+ return;
But, this is another "magic" arch check. Please use a named feature
flag; perhaps "TARGET_MANUALLY_INSERTED_WAIT_STATES".
I went for TARGET_NO_MANUAL_NOPS to be a bit more consistent with the
other names.
Tobias
gcc/config/gcn/gcn-opts.h | 5 +
gcn: Add "s_nop"s for MI300
MI300 requires some additional s_nop to be added between some instructions.
* As 'v_readlane' and 'v_writelane' have to be distinguished, the
'laneselect' attribute was changed from no/yes to no/read/write.
* Add some missing 'laneselect' attributes for v_(read,write)lane.
* Replace 'delayeduse' by 'flatmemaccess' which is more explicit,
especially as some uses have to be distinguished in more detail.
(Alongside, one off-by-two delayeduse has been fixed.)
On the other hand, RDNA 2, 3, and 3.5 do not require any added s_nop;
thus, there is no need to walk the instructions for them to insert
pointless S_NOP. (RDNA4 (not yet in GCC) requires it in a few cases.)
gcc/ChangeLog:
* config/gcn/gcn-opts.h (TARGET_NO_MANUAL_NOPS,
TARGET_CDNA3_NOPS): Define.
* config/gcn/gcn.md (define_attr "laneselect"): Change 'yes' to
'read' and 'write'.
(define_attr "flatmemaccess"): Add with values store, storex34,
load, atomic, atomicwait, cmpswapx2, and no. Replacing ...
(define_attr "delayeduse"): Remove.
(define_attr "transop"): Add with values yes and no.
(various insns): Update 'laneselect', add flatmemaccess and transop,
remove delayeduse; fixing an issue for s_load_dwordx4 vs.
flat_store_dwordx4 related to delayeduse (now: flatmemaccess).
* config/gcn/gcn-valu.md: Update laneselect attribute and add
flatmemaccess.
* config/gcn/gcn.cc (gcn_cmpx_insn_p): New.
(gcn_md_reorg): Update for MI300 to add additional s_nop.
Skip s_nop-insertion part for RDNA{2,3}; add "VALU writes EXEC
followed by VALU DPP" unconditionally for CDNA2/CDNA3/GCN5.
gcc/config/gcn/gcn-valu.md | 27 +++--
gcc/config/gcn/gcn.cc | 168 ++++++++++++++++++++++++++++--
gcc/config/gcn/gcn.md | 249 ++++++++++++++++++++++++---------------------
4 files changed, 312 insertions(+), 137 deletions(-)
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index 0bfc7869eef..fe68678bd02 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -68,36 +68,41 @@ enum hsaco_attr_type
/* There are load/store instructions for AVGPRS. */
#define TARGET_AVGPR_MEMOPS TARGET_CDNA2_PLUS
/* AVGPRS may have their own register file, or be combined with VGPRS. */
#define TARGET_AVGPR_COMBINED TARGET_CDNA2_PLUS
/* global_load/store has reduced offset. */
#define TARGET_11BIT_GLOBAL_OFFSET TARGET_RDNA2_PLUS
/* The work item details are all encoded into v0. */
//#define TARGET_PACKED_WORK_ITEMS TARGET_PACKED_WORK_ITEMS
/* CDNA2 load/store costs are reduced.
* TODO: what does this mean? */
#define TARGET_CDNA2_MEM_COSTS TARGET_CDNA2_PLUS
/* Wave32 devices running in wave64 compatibility mode. */
#define TARGET_WAVE64_COMPAT TARGET_RDNA2_PLUS
/* RDNA devices have different DPP with reduced capabilities. */
#define TARGET_DPP_FULL !TARGET_RDNA2_PLUS
#define TARGET_DPP16 TARGET_RDNA2_PLUS
#define TARGET_DPP8 TARGET_RDNA2_PLUS
+/* Device requires no manually inserted wait states; that's the
+ case for RDNA 2, 3 and 3.5 (but not for RDNA 4). */
+#define TARGET_NO_MANUAL_NOPS TARGET_RDNA2_PLUS
/* Device requires CDNA1-style manually inserted wait states for AVGPRs. */
#define TARGET_AVGPR_CDNA1_NOPS TARGET_CDNA1
+/* Device requires CDNA3-style manually inserted wait states. */
+#define TARGET_CDNA3_NOPS TARGET_CDNA3
/* Whether to use the 'globally coherent' (glc) or the 'scope' (sc0) flag
for non-scalar memory operations. The string starts on purpose with a space.
Note: for scalar memory operations (i.e. 's_...'), 'glc' is still used.
CDNA3 also uses 'nt' instead of 'slc' and 'sc1' instead of 'scc'; however,
there is no non-scalar user so far. */
#define TARGET_GLC_NAME (TARGET_CDNA3 ? " sc0" : " glc")
/* The metadata on different devices need different granularity. */
#define TARGET_VGPR_GRANULARITY \
(TARGET_RDNA3 ? 12 \
: TARGET_RDNA2_PLUS || TARGET_CDNA2_PLUS ? 8 \
: 4)
/* This mostly affects the metadata. */
#define TARGET_ARCHITECTED_FLAT_SCRATCH (TARGET_RDNA3 || TARGET_CDNA3)
/* Device has Sub-DWord Addressing instrucions. */
#define TARGET_SDWA (!TARGET_RDNA3)
/* Different devices uses different cache control instructions. */
#define TARGET_WBINVL1_CACHE (!TARGET_RDNA2_PLUS && !TARGET_CDNA3)
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 3899117f271..09943293293 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -797,95 +797,95 @@
; We allow source to be scratch.
;
; FIXME these should take A immediates
(define_insn "*vec_set<mode>"
[(set (match_operand:V_1REG 0 "register_operand" "= v")
(vec_merge:V_1REG
(vec_duplicate:V_1REG
(match_operand:<SCALAR_MODE> 1 "register_operand" " Sv"))
(match_operand:V_1REG 3 "gcn_register_or_unspec_operand" " U0")
(ashift (const_int 1)
(match_operand:SI 2 "gcn_alu_operand" "SvB"))))]
""
"v_writelane_b32 %0, %1, %2"
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
; FIXME: 64bit operations really should be splitters, but I am not sure how
; to represent vertical subregs.
(define_insn "*vec_set<mode>"
[(set (match_operand:V_2REG 0 "register_operand" "= v")
(vec_merge:V_2REG
(vec_duplicate:V_2REG
(match_operand:<SCALAR_MODE> 1 "register_operand" " Sv"))
(match_operand:V_2REG 3 "gcn_register_or_unspec_operand" " U0")
(ashift (const_int 1)
(match_operand:SI 2 "gcn_alu_operand" "SvB"))))]
""
"v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2"
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_expand "vec_set<mode>"
[(set (match_operand:V_MOV 0 "register_operand")
(vec_merge:V_MOV
(vec_duplicate:V_MOV
(match_operand:<SCALAR_MODE> 1 "register_operand"))
(match_dup 0)
(ashift (const_int 1) (match_operand:SI 2 "gcn_alu_operand"))))]
"")
(define_insn "*vec_set<mode>_1"
[(set (match_operand:V_1REG 0 "register_operand" "=v")
(vec_merge:V_1REG
(vec_duplicate:V_1REG
(match_operand:<SCALAR_MODE> 1 "register_operand" "Sv"))
(match_operand:V_1REG 3 "gcn_register_or_unspec_operand" "U0")
(match_operand:SI 2 "const_int_operand" " i")))]
"((unsigned) exact_log2 (INTVAL (operands[2])) < GET_MODE_NUNITS (<MODE>mode))"
{
operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
return "v_writelane_b32 %0, %1, %2";
}
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_insn "*vec_set<mode>_1"
[(set (match_operand:V_2REG 0 "register_operand" "=v")
(vec_merge:V_2REG
(vec_duplicate:V_2REG
(match_operand:<SCALAR_MODE> 1 "register_operand" "Sv"))
(match_operand:V_2REG 3 "gcn_register_or_unspec_operand" "U0")
(match_operand:SI 2 "const_int_operand" " i")))]
"((unsigned) exact_log2 (INTVAL (operands[2])) < GET_MODE_NUNITS (<MODE>mode))"
{
operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2])));
return "v_writelane_b32 %L0, %L1, %2\;v_writelane_b32 %H0, %H1, %2";
}
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "write")])
(define_insn "vec_duplicate<mode><exec>"
[(set (match_operand:V_1REG 0 "register_operand" "=v")
(vec_duplicate:V_1REG
(match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvB")))]
""
"v_mov_b32\t%0, %1"
[(set_attr "type" "vop3a")
(set_attr "length" "8")])
(define_insn "vec_duplicate<mode><exec>"
[(set (match_operand:V_2REG 0 "register_operand" "= v")
(vec_duplicate:V_2REG
(match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvDB")))]
""
"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1"
[(set_attr "type" "vop3a")
@@ -896,59 +896,59 @@
(vec_duplicate:V_4REG
(match_operand:<SCALAR_MODE> 1 "gcn_alu_operand" "SvDB")))]
""
"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\;v_mov_b32\t%J0, %J1\;v_mov_b32\t%K0, %K1"
[(set_attr "type" "mult")
(set_attr "length" "32")])
(define_insn "vec_extract<mode><scalar_mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg")
(vec_select:<SCALAR_MODE>
(match_operand:V_1REG 1 "register_operand" " v")
(parallel [(match_operand:SI 2 "gcn_alu_operand" "SvB")])))]
""
"v_readlane_b32 %0, %1, %2"
[(set_attr "type" "vop3a")
(set_attr "length" "8")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<mode><scalar_mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
(vec_select:<SCALAR_MODE>
(match_operand:V_2REG 1 "register_operand" " v")
(parallel [(match_operand:SI 2 "gcn_alu_operand" " SvB")])))]
""
"v_readlane_b32 %L0, %L1, %2\;v_readlane_b32 %H0, %H1, %2"
[(set_attr "type" "vmult")
(set_attr "length" "16")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<mode><scalar_mode>"
[(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=&Sg")
(vec_select:<SCALAR_MODE>
(match_operand:V_4REG 1 "register_operand" " v")
(parallel [(match_operand:SI 2 "gcn_alu_operand" " SvB")])))]
""
"v_readlane_b32 %L0, %L1, %2\;v_readlane_b32 %H0, %H1, %2\;v_readlane_b32 %J0, %J1, %2\;v_readlane_b32 %K0, %K1, %2"
[(set_attr "type" "vmult")
(set_attr "length" "32")
(set_attr "exec" "none")
- (set_attr "laneselect" "yes")])
+ (set_attr "laneselect" "read")])
(define_insn "vec_extract<V_1REG:mode><V_1REG_ALT:mode>_nop"
[(set (match_operand:V_1REG_ALT 0 "register_operand" "=v,v")
(vec_select:V_1REG_ALT
(match_operand:V_1REG 1 "register_operand" " 0,v")
(match_operand 2 "ascending_zero_int_parallel" "")))]
"MODE_VF (<V_1REG_ALT:MODE>mode) < MODE_VF (<V_1REG:MODE>mode)
&& <V_1REG_ALT:SCALAR_MODE>mode == <V_1REG:SCALAR_MODE>mode
/* This comment silences a warning for operands[2]. */"
"@
; in-place extract %0
v_mov_b32\t%L0, %L1"
[(set_attr "type" "vmult")
(set_attr "length" "0,8")])
(define_insn "vec_extract<V_2REG:mode><V_2REG_ALT:mode>_nop"
[(set (match_operand:V_2REG_ALT 0 "register_operand" "=v,v")
@@ -1178,34 +1178,35 @@
&& (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
{
addr_space_t as = INTVAL (operands[3]);
const char *glc = INTVAL (operands[4]) ? TARGET_GLC_NAME : "";
static char buf[200];
if (AS_FLAT_P (as))
sprintf (buf, "flat_load%%o0\t%%0, %%1 offset:%%2%s\;s_waitcnt\t0", glc);
else if (AS_GLOBAL_P (as))
sprintf (buf, "global_load%%o0\t%%0, %%1, off offset:%%2%s\;"
"s_waitcnt\tvmcnt(0)", glc);
else
gcc_unreachable ();
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "load")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2,*,cdna2")
(set_attr "xnack" "off,off,on,on")])
(define_insn "gather<mode>_insn_1offset_ds<exec>"
[(set (match_operand:V_MOV 0 "register_operand" "=v,a")
(unspec:V_MOV
[(plus:<VnSI> (match_operand:<VnSI> 1 "register_operand" " v,v")
(vec_duplicate:<VnSI>
(match_operand 2 "immediate_operand" " n,n")))
(match_operand 3 "immediate_operand" " n,n")
(match_operand 4 "immediate_operand" " n,n")
(mem:BLK (scratch))]
UNSPEC_GATHER))]
"(AS_ANY_DS_P (INTVAL (operands[3]))
&& ((unsigned HOST_WIDE_INT)INTVAL(operands[2]) < 0x10000))"
{
@@ -1236,34 +1237,35 @@
UNSPEC_GATHER))]
"(AS_GLOBAL_P (INTVAL (operands[4]))
&& (((unsigned HOST_WIDE_INT)INTVAL(operands[3]) + 0x1000) < 0x2000))"
{
addr_space_t as = INTVAL (operands[4]);
const char *glc = INTVAL (operands[5]) ? TARGET_GLC_NAME : "";
static char buf[200];
if (AS_GLOBAL_P (as))
sprintf (buf, "global_load%%o0\t%%0, %%2, %%1 offset:%%3%s\;"
"s_waitcnt\tvmcnt(0)", glc);
else
gcc_unreachable ();
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "load")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2,*,cdna2")
(set_attr "xnack" "off,off,on,on")])
(define_expand "scatter_store<mode><vnsi>"
[(match_operand:DI 0 "register_operand")
(match_operand:<VnSI> 1 "register_operand")
(match_operand 2 "immediate_operand")
(match_operand:SI 3 "gcn_alu_operand")
(match_operand:V_MOV 4 "register_operand")]
""
{
rtx addr = gcn_expand_scaled_offsets (DEFAULT_ADDR_SPACE, operands[0],
operands[1], operands[3],
INTVAL (operands[2]), NULL);
if (GET_MODE (addr) == <VnDI>mode)
@@ -1321,34 +1323,35 @@
|| (AS_GLOBAL_P (INTVAL (operands[3]))
&& (((unsigned HOST_WIDE_INT)INTVAL(operands[1]) + 0x1000) < 0x2000))"
{
addr_space_t as = INTVAL (operands[3]);
const char *glc = INTVAL (operands[4]) ? TARGET_GLC_NAME : "";
static char buf[200];
if (AS_FLAT_P (as))
sprintf (buf, "flat_store%%s2\t%%0, %%2 offset:%%1%s", glc);
else if (AS_GLOBAL_P (as))
sprintf (buf, "global_store%%s2\t%%0, %%2, off offset:%%1%s", glc);
else
gcc_unreachable ();
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "store")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2")])
(define_insn "scatter<mode>_insn_1offset_ds<exec_scatter>"
[(set (mem:BLK (scratch))
(unspec:BLK
[(plus:<VnSI> (match_operand:<VnSI> 0 "register_operand" "v,v")
(vec_duplicate:<VnSI>
(match_operand 1 "immediate_operand" "n,n")))
(match_operand:V_MOV 2 "register_operand" "v,a")
(match_operand 3 "immediate_operand" "n,n")
(match_operand 4 "immediate_operand" "n,n")]
UNSPEC_SCATTER))]
"(AS_ANY_DS_P (INTVAL (operands[3]))
&& ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x10000))"
{
addr_space_t as = INTVAL (operands[3]);
@@ -1376,34 +1379,35 @@
(match_operand 5 "immediate_operand" "n,n")]
UNSPEC_SCATTER))]
"(AS_GLOBAL_P (INTVAL (operands[4]))
&& (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
{
addr_space_t as = INTVAL (operands[4]);
const char *glc = INTVAL (operands[5]) ? TARGET_GLC_NAME : "";
static char buf[200];
if (AS_GLOBAL_P (as))
sprintf (buf, "global_store%%s3\t%%1, %%3, %%0 offset:%%2%s", glc);
else
gcc_unreachable ();
return buf;
}
[(set_attr "type" "flat")
+ (set_attr "flatmemaccess" "store")
(set_attr "length" "12")
(set_attr "cdna" "*,cdna2")])
;; }}}
;; {{{ Permutations
(define_insn "ds_bpermute<mode>"
[(set (match_operand:V_1REG 0 "register_operand" "=v")
(unspec:V_1REG
[(match_operand:V_1REG 2 "register_operand" " v")
(match_operand:<VnSI> 1 "register_operand" " v")
(match_operand:DI 3 "gcn_exec_reg_operand" " e")]
UNSPEC_BPERMUTE))]
""
"ds_bpermute_b32\t%0, %1, %2\;s_waitcnt\tlgkmcnt(0)"
[(set_attr "type" "vop2")
(set_attr "length" "12")])
@@ -3246,44 +3250,46 @@
(define_insn "neg<mode>2<exec>"
[(set (match_operand:V_FP 0 "register_operand" "=v")
(neg:V_FP
(match_operand:V_FP 1 "register_operand" " v")))]
""
"v_add%i0\t%0, 0, -%1"
[(set_attr "type" "vop3a")
(set_attr "length" "8")])
(define_insn "sqrt<mode>2<exec>"
[(set (match_operand:V_FP 0 "register_operand" "= v")
(sqrt:V_FP
(match_operand:V_FP 1 "gcn_alu_operand" "vSvB")))]
"flag_unsafe_math_optimizations"
"v_sqrt%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
(define_insn "sqrt<mode>2"
[(set (match_operand:FP 0 "register_operand" "= v")
(sqrt:FP
(match_operand:FP 1 "gcn_alu_operand" "vSvB")))]
"flag_unsafe_math_optimizations"
"v_sqrt%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
; These FP unops have f64, f32 and f16 versions.
(define_int_iterator MATH_UNOP_1OR2REG
[UNSPEC_FLOOR UNSPEC_CEIL])
; These FP unops only have f16/f32 versions.
(define_int_iterator MATH_UNOP_1REG
[UNSPEC_EXP2 UNSPEC_LOG2])
(define_int_iterator MATH_UNOP_TRIG
[UNSPEC_SIN UNSPEC_COS])
(define_int_attr math_unop
[(UNSPEC_FLOOR "floor")
(UNSPEC_CEIL "ceil")
(UNSPEC_EXP2 "exp2")
(UNSPEC_LOG2 "log2")
@@ -3545,35 +3551,36 @@
(match_operand:FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))]
""
"v_fma%i0\t%0, %1, -%2, -%3"
[(set_attr "type" "vop3a")
(set_attr "length" "8")])
;; }}}
;; {{{ FP division
(define_insn "recip<mode>2<exec>"
[(set (match_operand:SV_FP 0 "register_operand" "= v")
(unspec:SV_FP
[(match_operand:SV_FP 1 "gcn_alu_operand" "vSvB")]
UNSPEC_RCP))]
""
"v_rcp%i0\t%0, %1"
[(set_attr "type" "vop1")
- (set_attr "length" "8")])
+ (set_attr "length" "8")
+ (set_attr "transop" "yes")])
;; v_div_scale takes a numerator (op2) and denominator (op1) and returns the
;; one that matches op3 adjusted for best results in reciprocal division.
;; It also emits a VCC mask that is intended for input to v_div_fmas.
;; The caller is expected to call this twice, once for each input. The output
;; VCC is the same in both cases, so the caller may discard one.
(define_insn "div_scale<mode><exec_vcc>"
[(set (match_operand:SV_SFDF 0 "register_operand" "=v")
(unspec:SV_SFDF
[(match_operand:SV_SFDF 1 "gcn_alu_operand" "v")
(match_operand:SV_SFDF 2 "gcn_alu_operand" "v")
(match_operand:SV_SFDF 3 "gcn_alu_operand" "v")]
UNSPEC_DIV_SCALE))
(set (match_operand:DI 4 "register_operand" "=SvcV")
(unspec:DI
[(match_dup 1) (match_dup 2) (match_dup 3)]
UNSPEC_DIV_SCALE))]
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 6cd17d91f8a..8959118b869 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5778,34 +5778,70 @@ gcn_vectorize_builtin_vectorized_function (unsigned int fn, tree type_out,
TREE_READONLY (new_fndecl) = 1;
return new_fndecl;
}
/* Implement TARGET_LIBC_HAS_FUNCTION. */
bool
gcn_libc_has_function (enum function_class fn_class,
tree type)
{
return bsd_libc_has_function (fn_class, type);
}
/* }}} */
/* {{{ md_reorg pass. */
+/* Identify V_CMPX from the "type" attribute;
+ note: this will also match 'v_cmp %E1 vcc'. */
+
+static bool
+gcn_cmpx_insn_p (attr_type type)
+{
+ switch (type)
+ {
+ case TYPE_VOPC:
+ return true;
+ case TYPE_MUBUF:
+ case TYPE_MTBUF:
+ case TYPE_FLAT:
+ case TYPE_VOP3P_MAI:
+ case TYPE_UNKNOWN:
+ case TYPE_SOP1:
+ case TYPE_SOP2:
+ case TYPE_SOPK:
+ case TYPE_SOPC:
+ case TYPE_SOPP:
+ case TYPE_SMEM:
+ case TYPE_DS:
+ case TYPE_VOP2:
+ case TYPE_VOP1:
+ case TYPE_VOP3A:
+ case TYPE_VOP3B:
+ case TYPE_VOP_SDWA:
+ case TYPE_VOP_DPP:
+ case TYPE_MULT:
+ case TYPE_VMULT:
+ return false;
+ }
+ gcc_unreachable ();
+ return false;
+}
+
/* Identify VMEM instructions from their "type" attribute. */
static bool
gcn_vmem_insn_p (attr_type type)
{
switch (type)
{
case TYPE_MUBUF:
case TYPE_MTBUF:
case TYPE_FLAT:
case TYPE_VOP3P_MAI:
return true;
case TYPE_UNKNOWN:
case TYPE_SOP1:
case TYPE_SOP2:
case TYPE_SOPK:
case TYPE_SOPC:
@@ -6138,61 +6174,93 @@ gcn_md_reorg (void)
if (NONJUMP_INSN_P (end_insn))
end_insn = NEXT_INSN (end_insn);
gcn_restore_exec (end_insn, last_exec_def, curr_exec,
curr_exec_known, last_exec_def_saved);
}
}
CLEAR_REG_SET (&live);
/* "Manually Inserted Wait States (NOPs)."
GCN hardware detects most kinds of register dependencies, but there
are some exceptions documented in the ISA manual. This pass
detects the missed cases, and inserts the documented number of NOPs
required for correct execution. */
+ /* RDNA4 (not yet implemented) differs from RDNA 2/3/3.5 and requires some
+ s_nop, see 5.7 and esp. 5.7.2. in its ISA manual.
+ The assert here is a reminder to add those. */
+ STATIC_ASSERT (ISA_CDNA1 - ISA_RDNA3 == 1);
+
+ if (TARGET_NO_MANUAL_NOPS)
+ return;
+
const int max_waits = 5;
struct ilist
{
rtx_insn *insn;
attr_unit unit;
- attr_delayeduse delayeduse;
+ attr_type type;
+ attr_flatmemaccess flatmemaccess;
+ bool delayeduse;
HARD_REG_SET writes;
HARD_REG_SET reads;
int age;
} back[max_waits];
int oldest = 0;
for (int i = 0; i < max_waits; i++)
back[i].insn = NULL;
rtx_insn *insn, *last_insn = NULL;
for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn))
{
if (!NONDEBUG_INSN_P (insn))
continue;
if (GET_CODE (PATTERN (insn)) == USE
|| GET_CODE (PATTERN (insn)) == CLOBBER)
continue;
attr_type itype = get_attr_type (insn);
attr_unit iunit = get_attr_unit (insn);
- attr_delayeduse idelayeduse = get_attr_delayeduse (insn);
+ attr_flatmemaccess iflatmemaccess = get_attr_flatmemaccess (insn);
+ bool delayeduse;
+ if (TARGET_CDNA3_NOPS)
+ switch (iflatmemaccess)
+ {
+ case FLATMEMACCESS_STORE:
+ case FLATMEMACCESS_STOREX34:
+ case FLATMEMACCESS_ATOMIC:
+ case FLATMEMACCESS_CMPSWAPX2:
+ delayeduse = true;
+ break;
+ case FLATMEMACCESS_LOAD:
+ case FLATMEMACCESS_ATOMICWAIT:
+ case FLATMEMACCESS_NO:
+ delayeduse = false;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ else
+ delayeduse = (iflatmemaccess == FLATMEMACCESS_CMPSWAPX2
+ || iflatmemaccess == FLATMEMACCESS_STOREX34);
+
int ivccwait = get_attr_vccwait (insn);
HARD_REG_SET ireads, iwrites;
CLEAR_HARD_REG_SET (ireads);
CLEAR_HARD_REG_SET (iwrites);
note_stores (insn, record_hard_reg_sets, &iwrites);
note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads);
/* Scan recent previous instructions for dependencies not handled in
hardware. */
int nops_rqd = 0;
for (int i = oldest; i < oldest + max_waits; i++)
{
struct ilist *prev_insn = &back[i % max_waits];
if (!prev_insn->insn)
continue;
@@ -6209,72 +6277,148 @@ gcn_md_reorg (void)
/* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
requires 5 wait states. */
if ((prev_insn->age + nops_rqd) < 5
&& prev_insn->unit == UNIT_VECTOR
&& iunit == UNIT_VECTOR
&& ((hard_reg_set_intersect_p
(prev_insn->writes,
reg_class_contents[(int) EXEC_MASK_REG])
&& TEST_HARD_REG_BIT (ireads, EXECZ_REG))
||
(hard_reg_set_intersect_p
(prev_insn->writes,
reg_class_contents[(int) VCC_CONDITIONAL_REG])
&& TEST_HARD_REG_BIT (ireads, VCCZ_REG))))
nops_rqd = 5 - prev_insn->age;
- /* VALU writes SGPR/VCC followed by v_{read,write}lane using
- SGPR/VCC as lane select requires 4 wait states. */
+ /* VALU writes SGPR/VCC followed by
+ - v_{read,write}lane using SGPR/VCC as lane select requires
+ 4 wait states
+ - [CDNA3] VALU reads SGPR as constant requires 1 wait state
+ - [CDNA3] VALU reads SGPR as carry-in requires no wait states */
if ((prev_insn->age + nops_rqd) < 4
&& prev_insn->unit == UNIT_VECTOR
- && get_attr_laneselect (insn) == LANESELECT_YES
+ && get_attr_laneselect (insn) != LANESELECT_NO
&& (hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) SGPR_REGS])
|| hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) VCC_CONDITIONAL_REG])))
nops_rqd = 4 - prev_insn->age;
+ else if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && iunit == UNIT_VECTOR
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_REGS]))
+ nops_rqd = 1 - prev_insn->age;
/* VALU writes VGPR followed by VALU_DPP reading that VGPR
requires 2 wait states. */
if ((prev_insn->age + nops_rqd) < 2
&& prev_insn->unit == UNIT_VECTOR
&& itype == TYPE_VOP_DPP)
{
if (hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) VGPR_REGS]))
nops_rqd = 2 - prev_insn->age;
}
+ /* VALU writes EXEC followed by VALU DPP op requires 5 nop. */
+ if ((prev_insn->age + nops_rqd) < 5
+ && itype == TYPE_VOP_DPP
+ && prev_insn->unit == UNIT_VECTOR
+ && TEST_HARD_REG_BIT (prev_insn->writes, EXECZ_REG))
+ nops_rqd = 5 - prev_insn->age;
+
/* Store that requires input registers are not overwritten by
- following instruction. */
- if ((prev_insn->age + nops_rqd) < 1
- && prev_insn->delayeduse == DELAYEDUSE_YES
+ following instruction.
+ For CDNA3 only, VALU writes require 2 instead of 1 NOP.
+ CDNA3 additionally requires 1 or 2 NOPs for global & scratch
+ store/atomic. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 2
+ && prev_insn->delayeduse
+ && iunit == UNIT_VECTOR
+ && ((hard_reg_set_intersect_p
+ (prev_insn->reads, iwrites))))
+ nops_rqd = 2 - prev_insn->age;
+ else if ((prev_insn->age + nops_rqd) < 1
+ && prev_insn->delayeduse
&& ((hard_reg_set_intersect_p
(prev_insn->reads, iwrites))))
nops_rqd = 1 - prev_insn->age;
- /* Instruction that requires VCC is not written too close before
- using it. */
+ /* Instruction (such as v_div_fmas) that requires VCC is not written
+ too close before using it. */
if (prev_insn->age < ivccwait
&& (hard_reg_set_intersect_p
(prev_insn->writes,
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+ /* CDNA3: v_cmpx followed by
+ - V_readlane, v_readfirstlane, v_writelane requires 4 wait states
+ - VALU reads EXEC as constant requires 2 wait states
+ - other VALU requires no wait state */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 4
+ && gcn_cmpx_insn_p (prev_insn->type)
+ && get_attr_laneselect (insn) != LANESELECT_NO)
+ nops_rqd = 4 - prev_insn->age;
+ else if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 2
+ && iunit == UNIT_VECTOR
+ && gcn_cmpx_insn_p (prev_insn->type)
+ && TEST_HARD_REG_BIT (ireads, EXECZ_REG))
+ nops_rqd = 2 - prev_insn->age;
+
+ /* CDNA3: VALU writes VGPR followed by v_readlane vsrc0 reads VGPRn
+ requires 1 wait state. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && prev_insn->flatmemaccess != FLATMEMACCESS_LOAD
+ && get_attr_laneselect (insn) == LANESELECT_READ
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) VGPR_REGS]))
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU op which uses OPSEL or SDWA that changes the result's
+ bit position, followed by a VALU op that consumes the result of that op,
+ requires 1 wait state.
+ FIXME: Handle OPSEL, once used. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && prev_insn->type == TYPE_VOP_SDWA
+ && !hard_reg_set_empty_p (depregs))
+ nops_rqd = 1 - prev_insn->age;
+
+ /* CDNA3: VALU Trans Op (such as v_rcp_f64) followed by a non-trans VALU
+ op that consumes the result of that op requires 1 wait state. */
+ if (TARGET_CDNA3_NOPS
+ && (prev_insn->age + nops_rqd) < 1
+ && prev_insn->unit == UNIT_VECTOR
+ && iunit == UNIT_VECTOR
+ && get_attr_transop (prev_insn->insn) == TRANSOP_YES
+ && get_attr_transop (insn) == TRANSOP_NO
+ && !hard_reg_set_empty_p (depregs))
+ nops_rqd = 1 - prev_insn->age;
+
/* CDNA1: write VGPR before v_accvgpr_write reads it. */
if (TARGET_AVGPR_CDNA1_NOPS
&& (prev_insn->age + nops_rqd) < 2
&& hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) VGPR_REGS])
&& hard_reg_set_intersect_p
(iwrites, reg_class_contents[(int) AVGPR_REGS]))
nops_rqd = 2 - prev_insn->age;
/* CDNA1: v_accvgpr_write writes AVGPR before v_accvgpr_read. */
if (TARGET_AVGPR_CDNA1_NOPS
&& (prev_insn->age + nops_rqd) < 3
&& hard_reg_set_intersect_p
(depregs, reg_class_contents[(int) AVGPR_REGS])
&& hard_reg_set_intersect_p
(iwrites, reg_class_contents[(int) VGPR_REGS]))
nops_rqd = 3 - prev_insn->age;
@@ -6302,35 +6446,37 @@ gcn_md_reorg (void)
for (int i = oldest + max_waits - 1; i > oldest; i--)
{
struct ilist *prev_insn = &back[i % max_waits];
/* Assume all instructions are equivalent to one "wait", the same
as s_nop. This is probably true for SALU, but not VALU (which
may take longer), so this is not optimal. However, AMD do
not publish the cycle times for instructions. */
prev_insn->age += 1 + nops_rqd;
written |= iwrites;
prev_insn->writes &= ~written;
}
/* Track the current instruction as a previous instruction. */
back[oldest].insn = insn;
back[oldest].unit = iunit;
- back[oldest].delayeduse = idelayeduse;
+ back[oldest].type = itype;
+ back[oldest].flatmemaccess = iflatmemaccess;
+ back[oldest].delayeduse = delayeduse;
back[oldest].writes = iwrites;
back[oldest].reads = ireads;
back[oldest].age = 0;
oldest = (oldest + 1) % max_waits;
last_insn = insn;
}
}
/* }}} */
/* {{{ OpenACC / OpenMP. */
#define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */
#define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */
#define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */
/* Implement TARGET_GOACC_VALIDATE_DIMS.
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 9193461ed49..fad42e6a6bf 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -298,46 +298,56 @@
(and (eq_attr "rdna" "yes")
(eq (symbol_ref "TARGET_RDNA2_PLUS") (const_int 0)))
(const_int 0)
(and (eq_attr "cdna" "cdna2")
(eq (symbol_ref "TARGET_CDNA2_PLUS") (const_int 0)))
(const_int 0)
(and (eq_attr "xnack" "off")
(ne (symbol_ref "TARGET_XNACK") (const_int 0)))
(const_int 0)
(and (eq_attr "xnack" "on")
(eq (symbol_ref "TARGET_XNACK") (const_int 0)))
(const_int 0)]
(const_int 1)))
; We need to be able to identify v_readlane and v_writelane with
; SGPR lane selection in order to handle "Manually Inserted Wait States".
-(define_attr "laneselect" "yes,no" (const_string "no"))
+(define_attr "laneselect" "write,read,no" (const_string "no"))
-; Identify instructions that require a "Manually Inserted Wait State" if
-; their inputs are overwritten by subsequent instructions.
+; Global or flat memory access using store or load followed by waitcnt
+; and using flat/global atomic access, possibly followed by a waitcnt.
+; 'storex34' denotes FLAT_STORE_X{3,4}.
+; 'cmpswapx2' denotes FLAT_ATOMIC_{F}CMPSWAP_X2
+; Used to handle "Manually Inserted Wait State".
-(define_attr "delayeduse" "yes,no" (const_string "no"))
+(define_attr "flatmemaccess"
+ "store,storex34,load,atomic,atomicwait,cmpswapx2,no"
+ (const_string "no"))
; Identify instructions that require "Manually Inserted Wait State" if
; a previous instruction writes to VCC. The number gives the number of NOPs.
(define_attr "vccwait" "" (const_int 0))
+; Mark trans ops such as v_{exp,rsq,sqrt,sin,cos,log,...}_F{16,32,64}
+; for later conditional s_nop insertion.
+
+(define_attr "transop" "yes,no" (const_string "no"))
+
;; }}}
;; {{{ Iterators useful across the wole machine description
(define_mode_iterator SIDI [SI DI])
(define_mode_iterator SFDF [SF DF])
(define_mode_iterator SISF [SI SF])
(define_mode_iterator QIHI [QI HI])
(define_mode_iterator DIDF [DI DF])
(define_mode_iterator FP [HF SF DF])
(define_mode_iterator FP_1REG [HF SF])
;; }}}
;; {{{ Attributes.
; Translate RTX code into GCN instruction mnemonics with and without
; suffixes such as _b32, etc.
@@ -541,146 +551,148 @@
return "s_store_dword\t%1, %A0";
case 9:
case 10:
return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
case 11:
return "flat_store_dword\t%A0, %1%O0%g0";
case 12:
case 13:
return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
case 14:
return "global_store_dword\t%A0, %1%O0%g0";
default:
gcc_unreachable ();
}
}
[(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
flat,flat,flat,flat")
+ (set_attr "flatmemaccess" "*,*,*,*,*,*,*,*,*,load,load,store,load,load,store")
(set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
(set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
- (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")])
+ (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")
+ (set_attr "laneselect" "*,*,read,*,*,*,*,*,*,*,*,*,*,*,*")])
; 32bit move pattern
(define_insn "*mov<mode>_insn"
[(set (match_operand:SISF 0 "nonimmediate_operand")
(match_operand:SISF 1 "gcn_load_operand"))]
""
- {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack]
- [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,RB ;smem ,* ,12,* ,off] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
- [&SD ,RB ;smem ,* ,12,* ,on ] ^
- [RB ,Sm ;smem ,* ,12,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0
- [Sm ,RS ;smem ,* ,12,* ,off] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm ,RS ;smem ,* ,12,* ,on ] ^
- [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dword\t%1, %A0
- [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,8 ,* ,* ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,8 ,* ,* ] v_writelane_b32\t%0, %1, 0
- [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1
- [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1
- [a ,a ;vop1 ,* ,4,cdna2,* ] v_accvgpr_mov_b32\t%0, %1
- [v ,RF ;flat ,* ,12,* ,off] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a ,RF ;flat ,* ,12,cdna2,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store_dword\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [SD ,Y ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [v ,RM ;flat ,* ,12,* ,off] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a ,RM ;flat ,* ,12,cdna2,on ] ^
- [RM ,v ;flat ,* ,12,* ,* ] global_store_dword\t%A0, %1%O0%g0
- [RM ,a ;flat ,* ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess]
+ [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,RB ;smem ,* ,12,* ,off,* ,* ] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+ [&SD ,RB ;smem ,* ,12,* ,on ,* ,* ] ^
+ [RB ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_buffer_store%s1\t%1, s[0:3], %0
+ [Sm ,RS ;smem ,* ,12,* ,off,* ,* ] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm ,RS ;smem ,* ,12,* ,on ,* ,* ] ^
+ [RS ,Sm ;smem ,* ,12,* ,* ,* ,* ] s_store_dword\t%1, %A0
+ [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,8 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,8 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,4,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store_dword\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
+ [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [SD ,Y ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store_dword\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
})
; 8/16bit move pattern
; TODO: implement combined load and zero_extend, but *only* for -msram-ecc=on
(define_insn "*mov<mode>_insn"
[(set (match_operand:QIHI 0 "nonimmediate_operand")
(match_operand:QIHI 1 "gcn_load_operand"))]
"gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
- {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack]
- [SD ,SSA ;sop1 ,* ,4 ,* ,* ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ,* ,* ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ,* ,* ] s_mov_b32\t%0, %1
- [v ,v ;vop1 ,* ,4 ,* ,* ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,4 ,* ,* ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,4 ,* ,* ] v_writelane_b32\t%0, %1, 0
- [v ,^a ;vop3p_mai,*,8,* ,* ] v_accvgpr_read_b32\t%0, %1
- [a ,v ;vop3p_mai,*,8,* ,* ] v_accvgpr_write_b32\t%0, %1
- [a ,a ;vop1 ,* ,8,cdna2,* ] v_accvgpr_mov_b32\t%0, %1
- [v ,RF ;flat ,* ,12,* ,off] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a ,RF ;flat ,* ,12,cdna2,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store%s0\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,B ;vop1 ,* ,8 ,* ,* ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,* ,12,* ,off] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a ,RM ;flat ,* ,12,cdna2,on ] ^
- [RM ,v ;flat ,* ,12,* ,* ] global_store%s0\t%A0, %1%O0%g0
- [RM ,a ;flat ,* ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, exec, length, cdna, xnack, laneselect, flatmemaccess]
+ [SD ,SSA ;sop1 ,* ,4 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ,* ,* ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ,* ,* ,* ] s_mov_b32\t%0, %1
+ [v ,v ;vop1 ,* ,4 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,4 ,* ,* ,read ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,4 ,* ,* ,write,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ,* ,* ,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,8,cdna2,* ,* ,* ] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ,off,* ,load ] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RF ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RF ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RF ,v ;flat ,* ,12,* ,* ,* ,store] flat_store%s0\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
+ [v ,B ;vop1 ,* ,8 ,* ,* ,* ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ,* ,* ,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ,* ,* ,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,* ,12,* ,off,* ,load ] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,* ,12,* ,on ,* ,load ] ^
+ [^a ,RM ;flat ,* ,12,cdna2,off,* ,load ] ^
+ [&^a ,RM ;flat ,* ,12,cdna2,on ,* ,load ] ^
+ [RM ,v ;flat ,* ,12,* ,* ,* ,store] global_store%s0\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2,* ,* ,store] ^
})
; 64bit move pattern
(define_insn_and_split "*mov<mode>_insn"
[(set (match_operand:DIDF 0 "nonimmediate_operand")
(match_operand:DIDF 1 "general_operand"))]
"GET_CODE(operands[1]) != SYMBOL_REF"
- {@ [cons: =0, 1; attrs: type, length, cdna, xnack]
- [SD ,SSA ;sop1 ,4 ,* ,* ] s_mov_b64\t%0, %1
- [SD ,C ;sop1 ,8 ,* ,* ] ^
- [SD ,DB ;mult ,* ,* ,* ] #
- [RS ,Sm ;smem ,12,* ,* ] s_store_dwordx2\t%1, %A0
- [Sm ,RS ;smem ,12,* ,off] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm ,RS ;smem ,12,* ,on ] ^
- [v ,v ;vmult,* ,* ,* ] #
- [v ,DB ;vmult,* ,* ,* ] #
- [Sg ,v ;vmult,* ,* ,* ] #
- [v ,Sv ;vmult,* ,* ,* ] #
- [v ,^a ;vmult,* ,* ,* ] #
- [a ,v ;vmult,* ,* ,* ] #
- [a ,a ;vmult,* ,cdna2,* ] #
- [v ,RF ;flat ,12,* ,off] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,12,* ,on ] ^
- [^a ,RF ;flat ,12,cdna2,off] ^
- [&^a ,RF ;flat ,12,cdna2,on ] ^
- [RF ,v ;flat ,12,* ,* ] flat_store_dwordx2\t%A0, %1%O0%g0
- [RF ,a ;flat ,12,cdna2,* ] ^
- [RLRG,v ;ds ,12,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,12,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,12,* ,off] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,12,* ,on ] ^
- [^a ,RM ;flat ,12,cdna2,off] ^
- [&^a ,RM ;flat ,12,cdna2,on ] ^
- [RM ,v ;flat ,12,* ,* ] global_store_dwordx2\t%A0, %1%O0%g0
- [RM ,a ;flat ,12,cdna2,* ] ^
+ {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess]
+ [SD ,SSA ;sop1 ,4 ,* ,* ,* ] s_mov_b64\t%0, %1
+ [SD ,C ;sop1 ,8 ,* ,* ,* ] ^
+ [SD ,DB ;mult ,* ,* ,* ,* ] #
+ [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx2\t%1, %A0
+ [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm ,RS ;smem ,12,* ,on ,* ] ^
+ [v ,v ;vmult,* ,* ,* ,* ] #
+ [v ,DB ;vmult,* ,* ,* ,* ] #
+ [Sg ,v ;vmult,* ,* ,* ,* ] #
+ [v ,Sv ;vmult,* ,* ,* ,* ] #
+ [v ,^a ;vmult,* ,* ,* ,* ] #
+ [a ,v ;vmult,* ,* ,* ,* ] #
+ [a ,a ;vmult,* ,cdna2,* ,* ] #
+ [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,12,* ,on ,load ] ^
+ [^a ,RF ;flat ,12,cdna2,off,load ] ^
+ [&^a ,RF ;flat ,12,cdna2,on ,load ] ^
+ [RF ,v ;flat ,12,* ,* ,store] flat_store_dwordx2\t%A0, %1%O0%g0
+ [RF ,a ;flat ,12,cdna2,* ,store] ^
+ [RLRG,v ;ds ,12,* ,* ,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,12,* ,* ,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,12,* ,on ,load ] ^
+ [^a ,RM ;flat ,12,cdna2,off,load ] ^
+ [&^a ,RM ;flat ,12,cdna2,on ,load ] ^
+ [RM ,v ;flat ,12,* ,* ,store] global_store_dwordx2\t%A0, %1%O0%g0
+ [RM ,a ;flat ,12,cdna2,* ,store] ^
}
"reload_completed
&& ((!MEM_P (operands[0]) && !MEM_P (operands[1])
&& !gcn_sgpr_move_p (operands[0], operands[1]))
|| (GET_CODE (operands[1]) == CONST_INT
&& !gcn_constant64_p (operands[1])))"
[(set (match_dup 0) (match_dup 1))
(set (match_dup 2) (match_dup 3))]
{
rtx inlo = gen_lowpart (SImode, operands[1]);
rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
rtx outlo = gen_lowpart (SImode, operands[0]);
rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);
/* Ensure that overlapping registers aren't corrupted. */
if (reg_overlap_mentioned_p (outlo, inhi))
{
@@ -690,59 +702,59 @@
operands[3] = inlo;
}
else
{
operands[0] = outlo;
operands[1] = inlo;
operands[2] = outhi;
operands[3] = inhi;
}
})
; 128-bit move.
(define_insn_and_split "*movti_insn"
[(set (match_operand:TI 0 "nonimmediate_operand")
(match_operand:TI 1 "general_operand" ))]
""
- {@ [cons: =0, 1; attrs: type, delayeduse, length, cdna, xnack]
- [SD ,SSB;mult ,* ,* ,* ,* ] #
- [RS ,Sm ;smem ,* ,12,* ,* ] s_store_dwordx4\t%1, %A0
- [Sm ,RS ;smem ,yes,12,* ,off] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [&Sm,RS ;smem ,yes,12,* ,on ] ^
- [RF ,v ;flat ,* ,12,* ,* ] flat_store_dwordx4\t%A0, %1%O0%g0
- [RF ,a ;flat ,* ,12,cdna2,* ] ^
- [v ,RF ;flat ,* ,12,* ,off] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
- [&v ,RF ;flat ,* ,12,* ,on ] ^
- [^a ,RF ;flat ,* ,12,cdna2,off] ^
- [&^a,RF ;flat ,* ,12,cdna2,on ] ^
- [v ,v ;vmult,* ,* ,* ,* ] #
- [v ,Sv ;vmult,* ,* ,* ,* ] #
- [SD ,v ;vmult,* ,* ,* ,* ] #
- [RM ,v ;flat ,yes,12,* ,* ] global_store_dwordx4\t%A0, %1%O0%g0
- [RM ,a ;flat ,yes,12,cdna2,* ] ^
- [v ,RM ;flat ,* ,12,* ,off] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [&v ,RM ;flat ,* ,12,* ,on ] ^
- [^a ,RM ;flat ,* ,12,cdna2,off] ^
- [&^a,RM ;flat ,* ,12,cdna2,on ] ^
- [RL ,v ;ds ,* ,12,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RL ;ds ,* ,12,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,^a ;vmult,* ,* ,* ,* ] #
- [a ,v ;vmult,* ,* ,* ,* ] #
- [a ,a ;vmult,* ,* ,cdna2,* ] #
+ {@ [cons: =0, 1; attrs: type, length, cdna, xnack, flatmemaccess]
+ [SD ,SSB;mult ,* ,* ,* ,* ] #
+ [RS ,Sm ;smem ,12,* ,* ,* ] s_store_dwordx4\t%1, %A0
+ [Sm ,RS ;smem ,12,* ,off,* ] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [&Sm,RS ;smem ,12,* ,on ,* ] ^
+ [RF ,v ;flat ,12,* ,* ,storex34] flat_store_dwordx4\t%A0, %1%O0%g0
+ [RF ,a ;flat ,12,cdna2,* ,storex34] ^
+ [v ,RF ;flat ,12,* ,off,load ] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [&v ,RF ;flat ,12,* ,on ,load ] ^
+ [^a ,RF ;flat ,12,cdna2,off,load ] ^
+ [&^a,RF ;flat ,12,cdna2,on ,load ] ^
+ [v ,v ;vmult,* ,* ,* ,* ] #
+ [v ,Sv ;vmult,* ,* ,* ,* ] #
+ [SD ,v ;vmult,* ,* ,* ,* ] #
+ [RM ,v ;flat ,12,* ,* ,storex34] global_store_dwordx4\t%A0, %1%O0%g0
+ [RM ,a ;flat ,12,cdna2,* ,storex34] ^
+ [v ,RM ;flat ,12,* ,off,load ] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [&v ,RM ;flat ,12,* ,on ,load ] ^
+ [^a ,RM ;flat ,12,cdna2,off,load ] ^
+ [&^a,RM ;flat ,12,cdna2,on ,load ] ^
+ [RL ,v ;ds ,12,* ,* ,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RL ;ds ,12,* ,* ,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,^a ;vmult,* ,* ,* ,* ] #
+ [a ,v ;vmult,* ,* ,* ,* ] #
+ [a ,a ;vmult,* ,cdna2,* ,* ] #
}
"reload_completed
&& REG_P (operands[0])
&& (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)"
[(set (match_dup 0) (match_dup 1))
(set (match_dup 2) (match_dup 3))
(set (match_dup 4) (match_dup 5))
(set (match_dup 6) (match_dup 7))]
{
gcc_assert (rtx_equal_p (operands[0], operands[1])
|| !reg_overlap_mentioned_p (operands[0], operands[1]));
operands[6] = gcn_operand_part (TImode, operands[0], 3);
operands[7] = gcn_operand_part (TImode, operands[1], 3);
operands[4] = gcn_operand_part (TImode, operands[0], 2);
operands[5] = gcn_operand_part (TImode, operands[1], 2);
operands[2] = gcn_operand_part (TImode, operands[0], 1);
operands[3] = gcn_operand_part (TImode, operands[1], 1);
@@ -1971,55 +1983,57 @@
; TODO: flush caches according to memory model
(define_insn "atomic_fetch_<bare_mnemonic><mode>"
[(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
(match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
(set (match_dup 1)
(unspec_volatile:SIDI
[(atomicops:SIDI
(match_dup 1)
(match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
UNSPECV_ATOMIC))
(use (match_operand 3 "const_int_operand"))]
"0 /* Disabled. */"
"@
s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 %G2\;s_waitcnt\t0
global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "12")])
; FIXME: These patterns are disabled because the instructions don't
; seem to work as advertised. Specifically, OMP "team distribute"
; reductions apparently "lose" some of the writes, similar to what
; you might expect from a concurrent non-atomic read-modify-write.
; TODO: flush caches according to memory model
(define_insn "atomic_<bare_mnemonic><mode>"
[(set (match_operand:SIDI 0 "memory_operand" "+RS,RF,RM")
(unspec_volatile:SIDI
[(atomicops:SIDI
(match_dup 0)
(match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
UNSPECV_ATOMIC))
(use (match_operand 2 "const_int_operand"))]
"0 /* Disabled. */"
"@
s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "12")])
(define_mode_attr x2 [(SI "DI") (DI "TI")])
(define_mode_attr size [(SI "4") (DI "8")])
(define_mode_attr bitsize [(SI "32") (DI "64")])
(define_expand "sync_compare_and_swap<mode>"
[(match_operand:SIDI 0 "register_operand")
(match_operand:SIDI 1 "memory_operand")
(match_operand:SIDI 2 "register_operand")
(match_operand:SIDI 3 "register_operand")]
""
{
if (MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_LDS)
{
emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
operands[1],
@@ -2039,35 +2053,35 @@
DONE;
})
(define_insn "sync_compare_and_swap<mode>_insn"
[(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
(match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
(set (match_dup 1)
(unspec_volatile:SIDI
[(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
UNSPECV_ATOMIC))]
""
"@
s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
flat_atomic_cmpswap<X>\t%0, %1, %2 %G2\;s_waitcnt\t0
global_atomic_cmpswap<X>\t%0, %A1, %2%O1 %G2\;s_waitcnt\tvmcnt(0)"
[(set_attr "type" "smem,flat,flat")
(set_attr "length" "12")
- (set_attr "delayeduse" "*,yes,yes")])
+ (set_attr "flatmemaccess" "*,cmpswapx2,cmpswapx2")])
(define_insn "sync_compare_and_swap<mode>_lds_insn"
[(set (match_operand:SIDI 0 "register_operand" "= v")
(unspec_volatile:SIDI
[(match_operand:SIDI 1 "memory_operand" "+RL")]
UNSPECV_ATOMIC))
(set (match_dup 1)
(unspec_volatile:SIDI
[(match_operand:SIDI 2 "register_operand" " v")
(match_operand:SIDI 3 "register_operand" " v")]
UNSPECV_ATOMIC))]
""
{
if (TARGET_RDNA3)
return "ds_cmpstore_rtn_b<bitsize> %0, %1, %3, %2\;s_waitcnt\tlgkmcnt(0)";
else
return "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)";
@@ -2159,34 +2173,35 @@
return (TARGET_RDNA2
? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1 dlc\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_RDNA3
? "buffer_gl1_inv\;buffer_gl0_inv\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_TARGET_SC_CACHE
? "buffer_inv sc1\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol");
}
break;
}
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,load,load")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
(define_insn "atomic_store<mode>"
[(set (match_operand:SIDI 0 "memory_operand" "=RS,RF,RM")
(unspec_volatile:SIDI
[(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
UNSPECV_ATOMIC))
(use (match_operand:SIDI 2 "immediate_operand" " i, i, i"))]
""
{
switch (INTVAL (operands[2]))
{
case MEMMODEL_RELAXED:
switch (which_alternative)
{
case 0:
@@ -2243,34 +2258,35 @@
case 2:
return (TARGET_GLn_CACHE
? "buffer_gl1_inv\;buffer_gl0_inv\;global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
? "buffer_inv sc1\;global_store%o1\t%A0, %1%O0 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
}
break;
}
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,store,store")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
(define_insn "atomic_exchange<mode>"
[(set (match_operand:SIDI 0 "register_operand" "=Sm, v, v")
(match_operand:SIDI 1 "memory_operand" "+RS,RF,RM"))
(set (match_dup 1)
(unspec_volatile:SIDI
[(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
UNSPECV_ATOMIC))
(use (match_operand 3 "immediate_operand"))]
""
{
switch (INTVAL (operands[3]))
{
case MEMMODEL_RELAXED:
switch (which_alternative)
@@ -2375,34 +2391,35 @@
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_gl1_inv\;buffer_gl0_inv"
: TARGET_WBINVL1_CACHE
? "buffer_wbinvl1_vol\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol"
: TARGET_TARGET_SC_CACHE
? "buffer_inv sc1\;"
"global_atomic_swap<X>\t%0, %A1, %2%O1 %G1\;"
"s_waitcnt\tvmcnt(0)\;buffer_inv sc1"
: "error: cache architecture unspecified");
}
break;
}
gcc_unreachable ();
}
[(set_attr "type" "smem,flat,flat")
+ (set_attr "flatmemaccess" "*,atomicwait,atomicwait")
(set_attr "length" "28")
(set_attr "rdna" "no,*,*")])
;; }}}
;; {{{ OpenACC / OpenMP
(define_expand "oacc_dim_size"
[(match_operand:SI 0 "register_operand")
(match_operand:SI 1 "const_int_operand")]
""
{
rtx tmp = gcn_oacc_dim_size (INTVAL (operands[1]));
emit_move_insn (operands[0], gen_lowpart (SImode, tmp));
DONE;
})
(define_expand "oacc_dim_pos"