Some AMD GCN devices support an "XNACK" mode in which the device can handle page-misses (and maybe other traps in memory instructions), but it's not completely invisible to software.

We need this now to support OpenMP Unified Shared Memory (I plan to post updated patches for that in January), and in future it may enable support for APU devices (such as MI300).

The first patch ensures that load instructions are "restartable", meaning that the outputs do not overwrite the input registers (address and offsets). This maps pretty much exactly to the GCC "early-clobber" concept, so we just need to add additional alternatives and then not generate problem instructions explicitly.

The second patch is a workaround for the register allocation problem I asked about on gcc@ yesterday. The early-clobber increases register pressure, which causes compile failures when LRA is unable to spill additional registers without needing yet more registers. The problem does not show up so readily on gfx90a (MI200), thanks to the additional AVGPR spill registers, and that is the only device that really supports USM so far, so limiting XNACK to that device will work for now.

The -mxnack option was already added as a placeholder, so not much is needed there.

Committed to master. An older version of these patches is already committed to devel/omp/gcc-13 (OG13).

Andrew
amdgcn: Work around XNACK register allocation problem

The extra register pressure is causing infinite loops in some cases, especially
at -O0.  I have not yet observed any issue on devices that have AVGPRs for
spilling, and XNACK is only really useful on those devices anyway, so change
the defaults.

gcc/ChangeLog:

        * config/gcn/gcn-hsa.h (NO_XNACK): Change the defaults.
        * config/gcn/gcn-opts.h (enum hsaco_attr_type): Add HSACO_ATTR_DEFAULT.
        * config/gcn/gcn.cc (gcn_option_override): Set the default flag_xnack.
        * config/gcn/gcn.opt: Add -mxnack=default.
        * doc/invoke.texi: Document the -mxnack default.

diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h
index bfb104526c5..b44d42b02d6 100644
--- a/gcc/config/gcn/gcn-hsa.h
+++ b/gcc/config/gcn/gcn-hsa.h
@@ -75,7 +75,9 @@ extern unsigned int gcn_local_sym_hash (const char *name);
    supported for gcn.  */
 #define GOMP_SELF_SPECS ""
 
-#define NO_XNACK "march=fiji:;march=gfx1030:;"
+#define NO_XNACK "march=fiji:;march=gfx1030:;" \
+    /* These match the defaults set in gcn.cc.  */ \
+    "!mxnack*|mxnack=default:%{march=gfx900|march=gfx906|march=gfx908:-mattr=-xnack};"
 #define NO_SRAM_ECC "!march=*:;march=fiji:;march=gfx900:;march=gfx906:;"
 
 /* In HSACOv4 no attribute setting means the binary supports "any" hardware
diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h
index b4f494d868c..634cec6d832 100644
--- a/gcc/config/gcn/gcn-opts.h
+++ b/gcc/config/gcn/gcn-opts.h
@@ -65,7 +65,8 @@ enum hsaco_attr_type
 {
   HSACO_ATTR_OFF,
   HSACO_ATTR_ON,
-  HSACO_ATTR_ANY
+  HSACO_ATTR_ANY,
+  HSACO_ATTR_DEFAULT
 };
 
 #endif
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index d92cd01d03f..b67551a2e8e 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -172,6 +172,29 @@ gcn_option_override (void)
       /* Allow HSACO_ATTR_ANY silently because that's the default.  */
       flag_xnack = HSACO_ATTR_OFF;
     }
+
+  /* There's no need for XNACK on devices without USM, and there are register
+     allocation problems caused by the early-clobber when AVGPR spills are not
+     available.
+     FIXME: can the regalloc mean the default can be really "any"?  */
+  if (flag_xnack == HSACO_ATTR_DEFAULT)
+    switch (gcn_arch)
+      {
+      case PROCESSOR_FIJI:
+      case PROCESSOR_VEGA10:
+      case PROCESSOR_VEGA20:
+      case PROCESSOR_GFX908:
+       flag_xnack = HSACO_ATTR_OFF;
+       break;
+      case PROCESSOR_GFX90a:
+       flag_xnack = HSACO_ATTR_ANY;
+       break;
+      default:
+       gcc_unreachable ();
+      }
+
+  if (flag_sram_ecc == HSACO_ATTR_DEFAULT)
+    flag_sram_ecc = HSACO_ATTR_ANY;
 }
 
 /* }}}  */
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index c356a0cbb08..32486d9615f 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -97,9 +97,12 @@ Enum(hsaco_attr_type) String(on) Value(HSACO_ATTR_ON)
 EnumValue
 Enum(hsaco_attr_type) String(any) Value(HSACO_ATTR_ANY)
 
+EnumValue
+Enum(hsaco_attr_type) String(default) Value(HSACO_ATTR_DEFAULT)
+
 mxnack=
-Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_xnack) Init(HSACO_ATTR_ANY)
-Compile for devices requiring XNACK enabled. Default \"any\".
+Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_xnack) Init(HSACO_ATTR_DEFAULT)
+Compile for devices requiring XNACK enabled. Default \"any\" if USM is supported.
 
 msram-ecc=
 Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_sram_ecc) Init(HSACO_ATTR_ANY)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index db039c47220..8f885b8c6d6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21628,7 +21628,8 @@ run-time performance.  The default is 32KB when using OpenACC or OpenMP, and
 Compile binaries suitable for devices with the XNACK feature enabled, disabled,
 or either mode.  Some devices always require XNACK and some allow the user to
 configure XNACK.  The compiled code must match the device mode.
-The default is @samp{-mxnack=any}.
+The default is @samp{-mxnack=any} on devices that support Unified Shared
+Memory, and @samp{-mxnack=off} otherwise.
 
 @end table
 
amdgcn: Support XNACK mode

The XNACK feature allows memory load instructions to restart safely following
a page-miss interrupt.  This is useful for shared-memory devices, like APUs,
and to implement OpenMP Unified Shared Memory.

To support the feature we must be able to set the appropriate meta-data and
set the load instructions to early-clobber.  When the port supports scheduling
of s_waitcnt instructions there will be further requirements.

gcc/ChangeLog:

        * config/gcn/gcn-hsa.h (NO_XNACK): Ignore missing -march.
        (XNACKOPT): Match on/off; ignore any.
        * config/gcn/gcn-valu.md (gather<mode>_insn_1offset<exec>):
        Add xnack compatible alternatives.
        (gather<mode>_insn_2offsets<exec>): Likewise.
        * config/gcn/gcn.cc (gcn_option_override): Permit -mxnack for devices
        other than Fiji and gfx1030.
        (gcn_expand_epilogue): Remove early-clobber problems.
        (gcn_hsa_declare_function_name): Obey -mxnack setting.
        * config/gcn/gcn.md (xnack): New attribute.
        (enabled): Rework to include "xnack" attribute.
        (*movbi): Add xnack compatible alternatives.
        (*mov<mode>_insn): Likewise.
        (*mov<mode>_insn): Likewise.
        (*mov<mode>_insn): Likewise.
        (*movti_insn): Likewise.
        * config/gcn/gcn.opt (-mxnack): Change the default to "any".
        * doc/invoke.texi: Remove placeholder notice for -mxnack.

diff --git a/gcc/config/gcn/gcn-hsa.h b/gcc/config/gcn/gcn-hsa.h
index 4d72299da9b..bfb104526c5 100644
--- a/gcc/config/gcn/gcn-hsa.h
+++ b/gcc/config/gcn/gcn-hsa.h
@@ -75,16 +75,13 @@ extern unsigned int gcn_local_sym_hash (const char *name);
    supported for gcn.  */
 #define GOMP_SELF_SPECS ""
 
-#define NO_XNACK "!march=*:;march=fiji:;march=gfx1030:;"
+#define NO_XNACK "march=fiji:;march=gfx1030:;"
 #define NO_SRAM_ECC "!march=*:;march=fiji:;march=gfx900:;march=gfx906:;"
 
 /* In HSACOv4 no attribute setting means the binary supports "any" hardware
    configuration.  The name of the attribute also changed.  */
 #define SRAMOPT "msram-ecc=on:-mattr=+sramecc;msram-ecc=off:-mattr=-sramecc"
-
-/* Replace once XNACK is supported:
-   #define XNACKOPT "mxnack=on:-mattr=+xnack;mxnack=off:-mattr=-xnack"  */
-#define XNACKOPT "!mnack=*:-mattr=-xnack;mnack=*:-mattr=-xnack"
+#define XNACKOPT "mxnack=on:-mattr=+xnack;mxnack=off:-mattr=-xnack"
 
 /* Use LLVM assembler and linker options.  */
 #define ASM_SPEC  "-triple=amdgcn--amdhsa "  \
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index a928decd408..64b8ea1057f 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -1145,13 +1145,13 @@ (define_expand "gather<mode>_expr<exec>"
     {})
 
 (define_insn "gather<mode>_insn_1offset<exec>"
-  [(set (match_operand:V_MOV 0 "register_operand"                 "=v,a")
+  [(set (match_operand:V_MOV 0 "register_operand"                 "=v,a,&v,&a")
        (unspec:V_MOV
-         [(plus:<VnDI> (match_operand:<VnDI> 1 "register_operand" " v,v")
+         [(plus:<VnDI> (match_operand:<VnDI> 1 "register_operand" " v,v, v, v")
                        (vec_duplicate:<VnDI>
-                         (match_operand 2 "immediate_operand"     " n,n")))
-          (match_operand 3 "immediate_operand"                    " n,n")
-         (match_operand 4 "immediate_operand"                     " n,n")
+                         (match_operand 2 "immediate_operand"     " n,n, n, n")))
+          (match_operand 3 "immediate_operand"                    " n,n, n, n")
+          (match_operand 4 "immediate_operand"                    " n,n, n, n")
           (mem:BLK (scratch))]
          UNSPEC_GATHER))]
   "(AS_FLAT_P (INTVAL (operands[3]))
@@ -1182,7 +1182,8 @@ (define_insn "gather<mode>_insn_1offset<exec>"
   }
   [(set_attr "type" "flat")
    (set_attr "length" "12")
-   (set_attr "gcn_version" "*,cdna2")])
+   (set_attr "gcn_version" "*,cdna2,*,cdna2")
+   (set_attr "xnack" "off,off,on,on")])
 
 (define_insn "gather<mode>_insn_1offset_ds<exec>"
   [(set (match_operand:V_MOV 0 "register_operand"                 "=v,a")
@@ -1208,18 +1209,18 @@ (define_insn "gather<mode>_insn_1offset_ds<exec>"
    (set_attr "gcn_version" "*,cdna2")])
 
 (define_insn "gather<mode>_insn_2offsets<exec>"
-  [(set (match_operand:V_MOV 0 "register_operand"                    "=v,a")
+  [(set (match_operand:V_MOV 0 "register_operand"              "=v,a,&v,&a")
        (unspec:V_MOV
          [(plus:<VnDI>
             (plus:<VnDI>
               (vec_duplicate:<VnDI>
-                (match_operand:DI 1 "register_operand"               "Sv,Sv"))
+                (match_operand:DI 1 "register_operand"         "Sv,Sv,Sv,Sv"))
               (sign_extend:<VnDI>
-                (match_operand:<VnSI> 2 "register_operand"           " v,v")))
+                (match_operand:<VnSI> 2 "register_operand"     " v, v, v, v")))
             (vec_duplicate:<VnDI> (match_operand 3 "immediate_operand"
-                                                                     " n,n")))
-          (match_operand 4 "immediate_operand"                       " n,n")
-          (match_operand 5 "immediate_operand"                       " n,n")
+                                                               " n, n, n, n")))
+          (match_operand 4 "immediate_operand"                 " n, n, n, n")
+          (match_operand 5 "immediate_operand"                 " n, n, n, n")
           (mem:BLK (scratch))]
          UNSPEC_GATHER))]
   "(AS_GLOBAL_P (INTVAL (operands[4]))
@@ -1239,7 +1240,8 @@ (define_insn "gather<mode>_insn_2offsets<exec>"
   }
   [(set_attr "type" "flat")
    (set_attr "length" "12")
-   (set_attr "gcn_version" "*,cdna2")])
+   (set_attr "gcn_version" "*,cdna2,*,cdna2")
+   (set_attr "xnack" "off,off,on,on")])
 
 (define_expand "scatter_store<mode><vnsi>"
   [(match_operand:DI 0 "register_operand")
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 031b405e810..d92cd01d03f 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -160,11 +160,18 @@ gcn_option_override (void)
        acc_lds_size = 32768;
     }
 
-  /* The xnack option is a placeholder, for now.  Before removing, update
-     gcn-hsa.h's XNACKOPT, gcn.opt's mxnack= default init+descr, and
-     invoke.texi's default description.  */
-  if (flag_xnack != HSACO_ATTR_OFF)
-    sorry ("XNACK support");
+  /* gfx803 "Fiji" and gfx1030 do not support XNACK.  */
+  if (gcn_arch == PROCESSOR_FIJI
+      || gcn_arch == PROCESSOR_GFX1030)
+    {
+      if (flag_xnack == HSACO_ATTR_ON)
+       error ("-mxnack=on is incompatible with -march=%s",
+              (gcn_arch == PROCESSOR_FIJI ? "fiji"
+               : gcn_arch == PROCESSOR_GFX1030 ? "gfx1030"
+               : NULL));
+      /* Allow HSACO_ATTR_ANY silently because that's the default.  */
+      flag_xnack = HSACO_ATTR_OFF;
+    }
 }
 
 /* }}}  */
@@ -3585,18 +3592,20 @@ gcn_expand_epilogue (void)
       /* Assume that an exit value compatible with gcn-run is expected.
          That is, the third input parameter is an int*.
 
-         We can't allocate any new registers, but the kernarg_reg is
-         dead after this, so we'll use that.  */
+         We can't allocate any new registers, but the dispatch_ptr and
+        kernarg_reg are dead after this, so we'll use those.  */
+      rtx dispatch_ptr_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
+                                         [DISPATCH_PTR_ARG]);
       rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg
                                     [KERNARG_SEGMENT_PTR_ARG]);
       rtx retptr_mem = gen_rtx_MEM (DImode,
                                    gen_rtx_PLUS (DImode, kernarg_reg,
                                                  GEN_INT (16)));
       set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT);
-      emit_move_insn (kernarg_reg, retptr_mem);
+      emit_move_insn (dispatch_ptr_reg, retptr_mem);
 
       rtx retval_addr = gen_rtx_REG (DImode, FIRST_VPARM_REG + 2);
-      emit_move_insn (retval_addr, kernarg_reg);
+      emit_move_insn (retval_addr, dispatch_ptr_reg);
       rtx retval_mem = gen_rtx_MEM (SImode, retval_addr);
       set_mem_addr_space (retval_mem, ADDR_SPACE_FLAT);
       emit_move_insn (retval_mem, gen_rtx_REG (SImode, RETURN_VALUE_REG));
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index b7fbbaf830b..c7f63d0a3ac 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -295,6 +295,8 @@ (define_attr "length" ""
 (define_attr "gcn_version" "gcn3,gcn5,cdna2" (const_string "gcn3"))
 (define_attr "rdna" "any,no,yes" (const_string "any"))
 
+(define_attr "xnack" "na,off,on" (const_string "na"))
+
 (define_attr "enabled" ""
   (cond [(and (eq_attr "rdna" "no")
              (ne (symbol_ref "TARGET_RDNA2") (const_int 0)))
@@ -302,14 +304,19 @@ (define_attr "enabled" ""
         (and (eq_attr "rdna" "yes")
              (eq (symbol_ref "TARGET_RDNA2") (const_int 0)))
           (const_int 0)
-        (eq_attr "gcn_version" "gcn3") (const_int 1)
         (and (eq_attr "gcn_version" "gcn5")
-             (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
-          (const_int 1)
+             (eq (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
+          (const_int 0)
         (and (eq_attr "gcn_version" "cdna2")
-             (ne (symbol_ref "TARGET_CDNA2_PLUS") (const_int 0)))
-          (const_int 1)]
-       (const_int 0)))
+             (eq (symbol_ref "TARGET_CDNA2_PLUS") (const_int 0)))
+          (const_int 0)
+        (and (eq_attr "xnack" "off")
+             (ne (symbol_ref "TARGET_XNACK") (const_int 0)))
+          (const_int 0)
+        (and (eq_attr "xnack" "on")
+             (eq (symbol_ref "TARGET_XNACK") (const_int 0)))
+          (const_int 0)]
+       (const_int 1)))
 
 ; We need to be able to identify v_readlane and v_writelane with
 ; SGPR lane selection in order to handle "Manually Inserted Wait States".
@@ -508,9 +515,9 @@ (define_split
 
 (define_insn "*movbi"
   [(set (match_operand:BI 0 "nonimmediate_operand"
-                                   "=Sg,   v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM")
+                         "=Sg,   v,Sg,cs,cV,cV,Sm,&Sm,RS, v,&v,RF, v,&v,RM")
        (match_operand:BI 1 "gcn_load_operand"
-                                   "SSA,vSvA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))]
+                         "SSA,vSvA, v,SS, v,SS,RS, RS,Sm,RF,RF, v,RM,RM, v"))]
   ""
   {
     /* SCC as an operand is currently not accepted by the LLVM assembler, so
@@ -537,25 +544,29 @@ (define_insn "*movbi"
       return "s_mov_b32\tvcc_lo, %1\;"
             "s_mov_b32\tvcc_hi, 0";
     case 6:
-      return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
     case 7:
-      return "s_store_dword\t%1, %A0";
+      return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
     case 8:
-      return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
+      return "s_store_dword\t%1, %A0";
     case 9:
-      return "flat_store_dword\t%A0, %1%O0%g0";
     case 10:
-      return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
+      return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
     case 11:
+      return "flat_store_dword\t%A0, %1%O0%g0";
+    case 12:
+    case 13:
+      return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
+    case 14:
       return "global_store_dword\t%A0, %1%O0%g0";
     default:
       gcc_unreachable ();
     }
   }
-  [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat,
-                    flat,flat")
-   (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*")
-   (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12")])
+  [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,smem,flat,flat,
+                    flat,flat,flat,flat")
+   (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*,*,*,*")
+   (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12,12,12,12")
+   (set_attr "xnack" "*,*,*,*,*,*,off,on,*,off,on,*,off,on,*")])
 
 ; 32bit move pattern
 
@@ -563,32 +574,38 @@ (define_insn "*mov<mode>_insn"
   [(set (match_operand:SISF 0 "nonimmediate_operand")
        (match_operand:SISF 1 "gcn_load_operand"))]
   ""
-  {@ [cons: =0, 1; attrs: type, exec, length, gcn_version]
-   [SD  ,SSA ;sop1 ,*   ,4 ,*    ] s_mov_b32\t%0, %1
-   [SD  ,J   ;sopk ,*   ,4 ,*    ] s_movk_i32\t%0, %1
-   [SD  ,B   ;sop1 ,*   ,8 ,*    ] s_mov_b32\t%0, %1
-   [SD  ,RB  ;smem ,*   ,12,*    ] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
-   [RB  ,Sm  ;smem ,*   ,12,*    ] s_buffer_store%s1\t%1, s[0:3], %0
-   [Sm  ,RS  ;smem ,*   ,12,*    ] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-   [RS  ,Sm  ;smem ,*   ,12,*    ] s_store_dword\t%1, %A0
-   [v   ,v   ;vop1 ,*   ,4 ,*    ] v_mov_b32\t%0, %1
-   [Sg  ,v   ;vop3a,none,8 ,*    ] v_readlane_b32\t%0, %1, 0
-   [v   ,Sv  ;vop3a,none,8 ,*    ] v_writelane_b32\t%0, %1, 0
-   [v   ,^a  ;vop3p_mai,*,8,*    ] v_accvgpr_read_b32\t%0, %1
-   [a   ,v   ;vop3p_mai,*,8,*    ] v_accvgpr_write_b32\t%0, %1
-   [a   ,a   ;vop1 ,*    ,4,cdna2] v_accvgpr_mov_b32\t%0, %1
-   [v   ,RF  ;flat ,*   ,12,*    ] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
-   [^a  ,RF  ;flat ,*   ,12,cdna2] ^
-   [RF  ,v   ;flat ,*   ,12,*    ] flat_store_dword\t%A0, %1%O0%g0
-   [RF  ,a   ;flat ,*   ,12,cdna2] ^
-   [v   ,B   ;vop1 ,*   ,8 ,*    ] v_mov_b32\t%0, %1
-   [RLRG,v   ;ds   ,*   ,12,*    ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
-   [v   ,RLRG;ds   ,*   ,12,*    ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
-   [SD  ,Y   ;sop1 ,*   ,8 ,*    ] s_mov_b32\t%0, %1
-   [v   ,RM  ;flat ,*   ,12,*    ] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-   [^a  ,RM  ;flat ,*   ,12,cdna2] ^
-   [RM  ,v   ;flat ,*   ,12,*    ] global_store_dword\t%A0, %1%O0%g0
-   [RM  ,a   ;flat ,*   ,12,cdna2] ^
+  {@ [cons: =0, 1; attrs: type, exec, length, gcn_version, xnack]
+   [SD  ,SSA ;sop1 ,*   ,4 ,*    ,*  ] s_mov_b32\t%0, %1
+   [SD  ,J   ;sopk ,*   ,4 ,*    ,*  ] s_movk_i32\t%0, %1
+   [SD  ,B   ;sop1 ,*   ,8 ,*    ,*  ] s_mov_b32\t%0, %1
+   [SD  ,RB  ;smem ,*   ,12,*    ,off] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+   [&SD ,RB  ;smem ,*   ,12,*    ,on ] ^
+   [RB  ,Sm  ;smem ,*   ,12,*    ,*  ] s_buffer_store%s1\t%1, s[0:3], %0
+   [Sm  ,RS  ;smem ,*   ,12,*    ,off] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+   [&Sm ,RS  ;smem ,*   ,12,*    ,on ] ^
+   [RS  ,Sm  ;smem ,*   ,12,*    ,*  ] s_store_dword\t%1, %A0
+   [v   ,v   ;vop1 ,*   ,4 ,*    ,*  ] v_mov_b32\t%0, %1
+   [Sg  ,v   ;vop3a,none,8 ,*    ,*  ] v_readlane_b32\t%0, %1, 0
+   [v   ,Sv  ;vop3a,none,8 ,*    ,*  ] v_writelane_b32\t%0, %1, 0
+   [v   ,^a  ;vop3p_mai,*,8,*    ,*  ] v_accvgpr_read_b32\t%0, %1
+   [a   ,v   ;vop3p_mai,*,8,*    ,*  ] v_accvgpr_write_b32\t%0, %1
+   [a   ,a   ;vop1 ,*    ,4,cdna2,*  ] v_accvgpr_mov_b32\t%0, %1
+   [v   ,RF  ;flat ,*   ,12,*    ,off] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+   [&v  ,RF  ;flat ,*   ,12,*    ,on ] ^
+   [^a  ,RF  ;flat ,*   ,12,cdna2,off] ^
+   [&^a ,RF  ;flat ,*   ,12,cdna2,on ] ^
+   [RF  ,v   ;flat ,*   ,12,*    ,*  ] flat_store_dword\t%A0, %1%O0%g0
+   [RF  ,a   ;flat ,*   ,12,cdna2,*  ] ^
+   [v   ,B   ;vop1 ,*   ,8 ,*    ,*  ] v_mov_b32\t%0, %1
+   [RLRG,v   ;ds   ,*   ,12,*    ,*  ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+   [v   ,RLRG;ds   ,*   ,12,*    ,*  ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+   [SD  ,Y   ;sop1 ,*   ,8 ,*    ,*  ] s_mov_b32\t%0, %1
+   [v   ,RM  ;flat ,*   ,12,*    ,off] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+   [&v  ,RM  ;flat ,*   ,12,*    ,on ] ^
+   [^a  ,RM  ;flat ,*   ,12,cdna2,off] ^
+   [&^a ,RM  ;flat ,*   ,12,cdna2,on ] ^
+   [RM  ,v   ;flat ,*   ,12,*    ,*  ] global_store_dword\t%A0, %1%O0%g0
+   [RM  ,a   ;flat ,*   ,12,cdna2,*  ] ^
   })
 
 ; 8/16bit move pattern
@@ -598,27 +615,31 @@ (define_insn "*mov<mode>_insn"
   [(set (match_operand:QIHI 0 "nonimmediate_operand")
        (match_operand:QIHI 1 "gcn_load_operand"))]
   "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
-  {@ [cons: =0, 1; attrs: type, exec, length, gcn_version]
-  [SD  ,SSA ;sop1 ,*   ,4 ,*    ] s_mov_b32\t%0, %1
-  [SD  ,J   ;sopk ,*   ,4 ,*    ] s_movk_i32\t%0, %1
-  [SD  ,B   ;sop1 ,*   ,8 ,*    ] s_mov_b32\t%0, %1
-  [v   ,v   ;vop1 ,*   ,4 ,*    ] v_mov_b32\t%0, %1
-  [Sg  ,v   ;vop3a,none,4 ,*    ] v_readlane_b32\t%0, %1, 0
-  [v   ,Sv  ;vop3a,none,4 ,*    ] v_writelane_b32\t%0, %1, 0
-  [v   ,^a  ;vop3p_mai,*,8,*    ] v_accvgpr_read_b32\t%0, %1
-  [a   ,v   ;vop3p_mai,*,8,*    ] v_accvgpr_write_b32\t%0, %1
-  [a   ,a   ;vop1 ,*    ,8,cdna2] v_accvgpr_mov_b32\t%0, %1
-  [v    ,RF ;flat ,*   ,12,*    ] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
-  [^a   ,RF ;flat ,*   ,12,cdna2] ^
-  [RF  ,v   ;flat ,*   ,12,*    ] flat_store%s0\t%A0, %1%O0%g0
-  [RF  ,a   ;flat ,*   ,12,cdna2] ^
-  [v   ,B   ;vop1 ,*   ,8 ,*    ] v_mov_b32\t%0, %1
-  [RLRG,v   ;ds   ,*   ,12,*    ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
-  [v   ,RLRG;ds   ,*   ,12,*    ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
-  [v   ,RM  ;flat ,*   ,12,*    ] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  [^a  ,RM  ;flat ,*   ,12,cdna2] ^
-  [RM  ,v   ;flat ,*   ,12,*    ] global_store%s0\t%A0, %1%O0%g0
-  [RM  ,a   ;flat ,*   ,12,cdna2] ^
+  {@ [cons: =0, 1; attrs: type, exec, length, gcn_version, xnack]
+  [SD  ,SSA ;sop1 ,*   ,4 ,*    ,*  ] s_mov_b32\t%0, %1
+  [SD  ,J   ;sopk ,*   ,4 ,*    ,*  ] s_movk_i32\t%0, %1
+  [SD  ,B   ;sop1 ,*   ,8 ,*    ,*  ] s_mov_b32\t%0, %1
+  [v   ,v   ;vop1 ,*   ,4 ,*    ,*  ] v_mov_b32\t%0, %1
+  [Sg  ,v   ;vop3a,none,4 ,*    ,*  ] v_readlane_b32\t%0, %1, 0
+  [v   ,Sv  ;vop3a,none,4 ,*    ,*  ] v_writelane_b32\t%0, %1, 0
+  [v   ,^a  ;vop3p_mai,*,8,*    ,*  ] v_accvgpr_read_b32\t%0, %1
+  [a   ,v   ;vop3p_mai,*,8,*    ,*  ] v_accvgpr_write_b32\t%0, %1
+  [a   ,a   ;vop1 ,*    ,8,cdna2,*  ] v_accvgpr_mov_b32\t%0, %1
+  [v   ,RF  ;flat ,*   ,12,*    ,off] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+  [&v  ,RF  ;flat ,*   ,12,*    ,on ] ^
+  [^a  ,RF  ;flat ,*   ,12,cdna2,off] ^
+  [&^a ,RF  ;flat ,*   ,12,cdna2,on ] ^
+  [RF  ,v   ;flat ,*   ,12,*    ,*  ] flat_store%s0\t%A0, %1%O0%g0
+  [RF  ,a   ;flat ,*   ,12,cdna2,*  ] ^
+  [v   ,B   ;vop1 ,*   ,8 ,*    ,*  ] v_mov_b32\t%0, %1
+  [RLRG,v   ;ds   ,*   ,12,*    ,*  ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+  [v   ,RLRG;ds   ,*   ,12,*    ,*  ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  [v   ,RM  ;flat ,*   ,12,*    ,off] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  [&v  ,RM  ;flat ,*   ,12,*    ,on ] ^
+  [^a  ,RM  ;flat ,*   ,12,cdna2,off] ^
+  [&^a ,RM  ;flat ,*   ,12,cdna2,on ] ^
+  [RM  ,v   ;flat ,*   ,12,*    ,*  ] global_store%s0\t%A0, %1%O0%g0
+  [RM  ,a   ;flat ,*   ,12,cdna2,*  ] ^
   })
 
 ; 64bit move pattern
@@ -627,29 +648,34 @@ (define_insn_and_split "*mov<mode>_insn"
   [(set (match_operand:DIDF 0 "nonimmediate_operand")
        (match_operand:DIDF 1 "general_operand"))]
   "GET_CODE(operands[1]) != SYMBOL_REF"
-  {@ [cons: =0, 1; attrs: type, length, gcn_version]
-  [SD  ,SSA ;sop1 ,4 ,*    ] s_mov_b64\t%0, %1
-  [SD  ,C   ;sop1 ,8 ,*    ] ^
-  [SD  ,DB  ;mult ,* ,*    ] #
-  [RS  ,Sm  ;smem ,12,*    ] s_store_dwordx2\t%1, %A0
-  [Sm  ,RS  ;smem ,12,*    ] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  [v   ,v   ;vmult,* ,*    ] #
-  [v   ,DB  ;vmult,* ,*    ] #
-  [Sg  ,v   ;vmult,* ,*    ] #
-  [v   ,Sv  ;vmult,* ,*    ] #
-  [v   ,^a  ;vmult,* ,*    ] #
-  [a   ,v   ;vmult,* ,*    ] #
-  [a   ,a   ;vmult,* ,cdna2] #
-  [v   ,RF  ;flat ,12,*    ] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
-  [^a  ,RF  ;flat ,12,cdna2] ^
-  [RF  ,v   ;flat ,12,*    ] flat_store_dwordx2\t%A0, %1%O0%g0
-  [RF  ,a   ;flat ,12,cdna2] ^
-  [RLRG,v   ;ds   ,12,*    ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
-  [v   ,RLRG;ds   ,12,*    ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
-  [v   ,RM  ;flat ,12,*    ] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  [^a  ,RM  ;flat ,12,cdna2] ^
-  [RM  ,v   ;flat ,12,*    ] global_store_dwordx2\t%A0, %1%O0%g0
-  [RM  ,a   ;flat ,12,cdna2] ^
+  {@ [cons: =0, 1; attrs: type, length, gcn_version, xnack]
+  [SD  ,SSA ;sop1 ,4 ,*    ,*  ] s_mov_b64\t%0, %1
+  [SD  ,C   ;sop1 ,8 ,*    ,*  ] ^
+  [SD  ,DB  ;mult ,* ,*    ,*  ] #
+  [RS  ,Sm  ;smem ,12,*    ,*  ] s_store_dwordx2\t%1, %A0
+  [Sm  ,RS  ;smem ,12,*    ,off] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  [&Sm ,RS  ;smem ,12,*    ,on ] ^
+  [v   ,v   ;vmult,* ,*    ,*  ] #
+  [v   ,DB  ;vmult,* ,*    ,*  ] #
+  [Sg  ,v   ;vmult,* ,*    ,*  ] #
+  [v   ,Sv  ;vmult,* ,*    ,*  ] #
+  [v   ,^a  ;vmult,* ,*    ,*  ] #
+  [a   ,v   ;vmult,* ,*    ,*  ] #
+  [a   ,a   ;vmult,* ,cdna2,*  ] #
+  [v   ,RF  ;flat ,12,*    ,off] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+  [&v  ,RF  ;flat ,12,*    ,on ] ^
+  [^a  ,RF  ;flat ,12,cdna2,off] ^
+  [&^a ,RF  ;flat ,12,cdna2,on ] ^
+  [RF  ,v   ;flat ,12,*    ,*  ] flat_store_dwordx2\t%A0, %1%O0%g0
+  [RF  ,a   ;flat ,12,cdna2,*  ] ^
+  [RLRG,v   ;ds   ,12,*    ,*  ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+  [v   ,RLRG;ds   ,12,*    ,*  ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  [v   ,RM  ;flat ,12,*    ,off] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  [&v  ,RM  ;flat ,12,*    ,on ] ^
+  [^a  ,RM  ;flat ,12,cdna2,off] ^
+  [&^a ,RM  ;flat ,12,cdna2,on ] ^
+  [RM  ,v   ;flat ,12,*    ,*  ] global_store_dwordx2\t%A0, %1%O0%g0
+  [RM  ,a   ;flat ,12,cdna2,*  ] ^
   }
   "reload_completed
    && ((!MEM_P (operands[0]) && !MEM_P (operands[1])
@@ -687,26 +713,31 @@ (define_insn_and_split "*movti_insn"
   [(set (match_operand:TI 0 "nonimmediate_operand")
        (match_operand:TI 1 "general_operand"  ))]
   ""
-  {@ [cons: =0, 1; attrs: type, delayeduse, length, gcn_version]
-  [SD,SSB;mult ,*  ,* ,*    ] #
-  [RS,Sm ;smem ,*  ,12,*    ] s_store_dwordx4\t%1, %A0
-  [Sm,RS ;smem ,yes,12,*    ] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
-  [RF,v  ;flat ,*  ,12,*    ] flat_store_dwordx4\t%A0, %1%O0%g0
-  [RF,a  ;flat ,*  ,12,cdna2] ^
-  [v ,RF ;flat ,*  ,12,*    ] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
-  [^a,RF ;flat ,*  ,12,cdna2] ^
-  [v ,v  ;vmult,*  ,* ,*    ] #
-  [v ,Sv ;vmult,*  ,* ,*    ] #
-  [SD,v  ;vmult,*  ,* ,*    ] #
-  [RM,v  ;flat ,yes,12,*    ] global_store_dwordx4\t%A0, %1%O0%g0
-  [RM,a  ;flat ,yes,12,cdna2] ^
-  [v ,RM ;flat ,*  ,12,*    ] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
-  [^a,RM ;flat ,*  ,12,cdna2] ^
-  [RL,v  ;ds   ,*  ,12,*    ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
-  [v ,RL ;ds   ,*  ,12,*    ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
-  [v ,^a ;vmult,*  ,* ,*    ] #
-  [a ,v  ;vmult,*  ,* ,*    ] #
-  [a ,a  ;vmult,*  ,* ,cdna2] #
+  {@ [cons: =0, 1; attrs: type, delayeduse, length, gcn_version, xnack]
+  [SD ,SSB;mult ,*  ,* ,*    ,*  ] #
+  [RS ,Sm ;smem ,*  ,12,*    ,*  ] s_store_dwordx4\t%1, %A0
+  [Sm ,RS ;smem ,yes,12,*    ,off] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+  [&Sm,RS ;smem ,yes,12,*    ,on ] ^
+  [RF ,v  ;flat ,*  ,12,*    ,*  ] flat_store_dwordx4\t%A0, %1%O0%g0
+  [RF ,a  ;flat ,*  ,12,cdna2,*  ] ^
+  [v  ,RF ;flat ,*  ,12,*    ,off] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+  [&v ,RF ;flat ,*  ,12,*    ,on ] ^
+  [^a ,RF ;flat ,*  ,12,cdna2,off] ^
+  [&^a,RF ;flat ,*  ,12,cdna2,on ] ^
+  [v  ,v  ;vmult,*  ,* ,*    ,*  ] #
+  [v  ,Sv ;vmult,*  ,* ,*    ,*  ] #
+  [SD ,v  ;vmult,*  ,* ,*    ,*  ] #
+  [RM ,v  ;flat ,yes,12,*    ,*  ] global_store_dwordx4\t%A0, %1%O0%g0
+  [RM ,a  ;flat ,yes,12,cdna2,*  ] ^
+  [v  ,RM ;flat ,*  ,12,*    ,off] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+  [&v ,RM ;flat ,*  ,12,*    ,on ] ^
+  [^a ,RM ;flat ,*  ,12,cdna2,off] ^
+  [&^a,RM ;flat ,*  ,12,cdna2,on ] ^
+  [RL ,v  ;ds   ,*  ,12,*    ,*  ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+  [v  ,RL ;ds   ,*  ,12,*    ,*  ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+  [v  ,^a ;vmult,*  ,* ,*    ,*  ] #
+  [a  ,v  ;vmult,*  ,* ,*    ,*  ] #
+  [a  ,a  ;vmult,*  ,* ,cdna2,*  ] #
   }
   "reload_completed
    && REG_P (operands[0])
@@ -889,6 +920,8 @@ (define_insn "movdi_symbol"
   (clobber (reg:BI SCC_REG))]
  "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF"
   {
+    /* This s_load may not be XNACK-safe on devices where the GOT may fault.
+       DGPUs are most likely fine.  */
     if (SYMBOL_REF_P (operands[1])
        && SYMBOL_REF_WEAK (operands[1]))
        return "s_getpc_b64\t%0\;"
@@ -913,6 +946,8 @@ (define_insn "movdi_symbol_save_scc"
   {
     /* !!! These sequences clobber CC_SAVE_REG.  */
 
+    /* This s_load may not be XNACK-safe on devices where the GOT may fault.
+       DGPUs are most likely fine.  */
     if (SYMBOL_REF_P (operands[1])
        && SYMBOL_REF_WEAK (operands[1]))
        return "s_mov_b32\ts22, scc\;"
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index e5db6df92d7..c356a0cbb08 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -98,8 +98,8 @@ EnumValue
 Enum(hsaco_attr_type) String(any) Value(HSACO_ATTR_ANY)
 
 mxnack=
-Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_xnack) Init(HSACO_ATTR_OFF)
-Compile for devices requiring XNACK enabled. Default \"off\".
+Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_xnack) Init(HSACO_ATTR_ANY)
+Compile for devices requiring XNACK enabled. Default \"any\".
 
 msram-ecc=
 Target RejectNegative Joined ToLower Enum(hsaco_attr_type) Var(flag_sram_ecc) Init(HSACO_ATTR_ANY)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 19feba467a4..db039c47220 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21628,8 +21628,7 @@ run-time performance.  The default is 32KB when using OpenACC or OpenMP, and
 Compile binaries suitable for devices with the XNACK feature enabled, disabled,
 or either mode.  Some devices always require XNACK and some allow the user to
 configure XNACK.  The compiled code must match the device mode.
-@c The default is @samp{-mxnack=any}.
-At present this option is a placeholder for support that is not yet implemented.
+The default is @samp{-mxnack=any}.
 
 @end table
 

Reply via email to