[gem5-dev] Change in gem5/gem5[develop]: arch-vega: Implement non-carry-out VEGA add, sub, and subrev

Michael Boyer (Gerrit) via gem5-dev Fri, 25 Jun 2021 13:57:09 -0700

Michael Boyer has uploaded this change for review. (https://gem5-review.googlesource.com/c/public/gem5/+/47240 )


Change subject: arch-vega: Implement non-carry-out VEGA add, sub, and subrev
......................................................................

arch-vega: Implement non-carry-out VEGA add, sub, and subrev

In GCN3, the v_add_u32, v_sub_u32, and v_subrev_u32 instructions write
the carry-out value to VCC. VEGA introduces explicit carry-out versions
of these instructions (v_add_co_u32, v_sub_co_u32, and v_subrev_co_u32),
and modifies the behavior of the baseline, non-carry-out versions to not
write to VCC. Previously both the carry-out and non-carry-out versions
shared a single implementation that wrote to VCC. This patch correctly
implements the non-carry-out versions to avoid the VCC write.

This patch also makes the following substitutions for GCN3 instructions
that no longer exist in VEGA (this renaming has no functional impact):
v_addc_u32 -> v_addc_co_u32
v_subb_u32 -> v_subb_co_u32
v_subbrev_u32 -> v_subbrev_co_u32

Change-Id: I002fa6e9316d38fd4cc3554daff047523cfc12c9
---
M src/arch/amdgpu/vega/decoder.cc
M src/arch/amdgpu/vega/gpu_decoder.hh
M src/arch/amdgpu/vega/insts/instructions.cc
M src/arch/amdgpu/vega/insts/instructions.hh
4 files changed, 415 insertions(+), 172 deletions(-)

diff --git a/src/arch/amdgpu/vega/decoder.cc b/src/arch/amdgpu/vega/decoder.cc
index 5be0d3d..363f7e1 100644
--- a/src/arch/amdgpu/vega/decoder.cc
+++ b/src/arch/amdgpu/vega/decoder.cc
@@ -849,12 +849,12 @@
         &Decoder::decode_OPU_VOP3__V_MAC_F32,
         &Decoder::decode_invalid,
         &Decoder::decode_invalid,
-        &Decoder::decode_OPU_VOP3__V_ADD_U32,
-        &Decoder::decode_OPU_VOP3__V_SUB_U32,
-        &Decoder::decode_OPU_VOP3__V_SUBREV_U32,
-        &Decoder::decode_OPU_VOP3__V_ADDC_U32,
-        &Decoder::decode_OPU_VOP3__V_SUBB_U32,
-        &Decoder::decode_OPU_VOP3__V_SUBBREV_U32,
+        &Decoder::decode_OPU_VOP3__V_ADD_CO_U32,
+        &Decoder::decode_OPU_VOP3__V_SUB_CO_U32,
+        &Decoder::decode_OPU_VOP3__V_SUBREV_CO_U32,
+        &Decoder::decode_OPU_VOP3__V_ADDC_CO_U32,
+        &Decoder::decode_OPU_VOP3__V_SUBB_CO_U32,
+        &Decoder::decode_OPU_VOP3__V_SUBBREV_CO_U32,
         &Decoder::decode_OPU_VOP3__V_ADD_F16,
         &Decoder::decode_OPU_VOP3__V_SUB_F16,
         &Decoder::decode_OPU_VOP3__V_SUBREV_F16,
@@ -3993,37 +3993,37 @@
     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_ADD_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP2__V_ADD_U32(&iFmt->iFmt_VOP2);
+        return new Inst_VOP2__V_ADD_CO_U32(&iFmt->iFmt_VOP2);
     } // decode_OP_VOP2__V_ADD_CO_U32

     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_SUB_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP2__V_SUB_U32(&iFmt->iFmt_VOP2);
+        return new Inst_VOP2__V_SUB_CO_U32(&iFmt->iFmt_VOP2);
     } // decode_OP_VOP2__V_SUB_CO_U32

     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_SUBREV_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP2__V_SUBREV_U32(&iFmt->iFmt_VOP2);
+        return new Inst_VOP2__V_SUBREV_CO_U32(&iFmt->iFmt_VOP2);
     } // decode_OP_VOP2__V_SUBREV_CO_U32

     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_ADDC_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP2__V_ADDC_U32(&iFmt->iFmt_VOP2);
+        return new Inst_VOP2__V_ADDC_CO_U32(&iFmt->iFmt_VOP2);
     } // decode_OP_VOP2__V_ADDC_CO_U32

     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_SUBB_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP2__V_SUBB_U32(&iFmt->iFmt_VOP2);
+        return new Inst_VOP2__V_SUBB_CO_U32(&iFmt->iFmt_VOP2);
     } // decode_OP_VOP2__V_SUBB_CO_U32

     GPUStaticInst*
     Decoder::decode_OP_VOP2__V_SUBBREV_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP2__V_SUBBREV_U32(&iFmt->iFmt_VOP2);
+        return new Inst_VOP2__V_SUBBREV_CO_U32(&iFmt->iFmt_VOP2);
     } // decode_OP_VOP2__V_SUBBREV_CO_U32

     GPUStaticInst*
@@ -5947,40 +5947,40 @@
     } // decode_OPU_VOP3__V_MAC_F32

     GPUStaticInst*
-    Decoder::decode_OPU_VOP3__V_ADD_U32(MachInst iFmt)
+    Decoder::decode_OPU_VOP3__V_ADD_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP3__V_ADD_U32(&iFmt->iFmt_VOP3B);
-    } // decode_OPU_VOP3__V_ADD_U32
+        return new Inst_VOP3__V_ADD_CO_U32(&iFmt->iFmt_VOP3B);
+    } // decode_OPU_VOP3__V_ADD_CO_U32

     GPUStaticInst*
-    Decoder::decode_OPU_VOP3__V_SUB_U32(MachInst iFmt)
+    Decoder::decode_OPU_VOP3__V_SUB_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP3__V_SUB_U32(&iFmt->iFmt_VOP3B);
-    } // decode_OPU_VOP3__V_SUB_U32
+        return new Inst_VOP3__V_SUB_CO_U32(&iFmt->iFmt_VOP3B);
+    } // decode_OPU_VOP3__V_SUB_CO_U32

     GPUStaticInst*
-    Decoder::decode_OPU_VOP3__V_SUBREV_U32(MachInst iFmt)
+    Decoder::decode_OPU_VOP3__V_SUBREV_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP3__V_SUBREV_U32(&iFmt->iFmt_VOP3B);
-    } // decode_OPU_VOP3__V_SUBREV_U32
+        return new Inst_VOP3__V_SUBREV_CO_U32(&iFmt->iFmt_VOP3B);
+    } // decode_OPU_VOP3__V_SUBREV_CO_U32

     GPUStaticInst*
-    Decoder::decode_OPU_VOP3__V_ADDC_U32(MachInst iFmt)
+    Decoder::decode_OPU_VOP3__V_ADDC_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP3__V_ADDC_U32(&iFmt->iFmt_VOP3B);
-    } // decode_OPU_VOP3__V_ADDC_U32
+        return new Inst_VOP3__V_ADDC_CO_U32(&iFmt->iFmt_VOP3B);
+    } // decode_OPU_VOP3__V_ADDC_CO_U32

     GPUStaticInst*
-    Decoder::decode_OPU_VOP3__V_SUBB_U32(MachInst iFmt)
+    Decoder::decode_OPU_VOP3__V_SUBB_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP3__V_SUBB_U32(&iFmt->iFmt_VOP3B);
-    } // decode_OPU_VOP3__V_SUBB_U32
+        return new Inst_VOP3__V_SUBB_CO_U32(&iFmt->iFmt_VOP3B);
+    } // decode_OPU_VOP3__V_SUBB_CO_U32

     GPUStaticInst*
-    Decoder::decode_OPU_VOP3__V_SUBBREV_U32(MachInst iFmt)
+    Decoder::decode_OPU_VOP3__V_SUBBREV_CO_U32(MachInst iFmt)
     {
-        return new Inst_VOP3__V_SUBBREV_U32(&iFmt->iFmt_VOP3B);
-    } // decode_OPU_VOP3__V_SUBBREV_U32
+        return new Inst_VOP3__V_SUBBREV_CO_U32(&iFmt->iFmt_VOP3B);
+    } // decode_OPU_VOP3__V_SUBBREV_CO_U32

     GPUStaticInst*
     Decoder::decode_OPU_VOP3__V_ADD_F16(MachInst iFmt)

diff --git a/src/arch/amdgpu/vega/gpu_decoder.hh b/src/arch/amdgpu/vega/gpu_decoder.hh
index 69954f8..0159589 100644
--- a/src/arch/amdgpu/vega/gpu_decoder.hh
+++ b/src/arch/amdgpu/vega/gpu_decoder.hh
@@ -296,12 +296,12 @@
         GPUStaticInst* decode_OPU_VOP3__V_OR_B32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_XOR_B32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_MAC_F32(MachInst);
-        GPUStaticInst* decode_OPU_VOP3__V_ADD_U32(MachInst);
-        GPUStaticInst* decode_OPU_VOP3__V_SUB_U32(MachInst);
-        GPUStaticInst* decode_OPU_VOP3__V_SUBREV_U32(MachInst);
-        GPUStaticInst* decode_OPU_VOP3__V_ADDC_U32(MachInst);
-        GPUStaticInst* decode_OPU_VOP3__V_SUBB_U32(MachInst);
-        GPUStaticInst* decode_OPU_VOP3__V_SUBBREV_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_ADD_CO_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_SUB_CO_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_SUBREV_CO_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_ADDC_CO_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_SUBB_CO_U32(MachInst);
+        GPUStaticInst* decode_OPU_VOP3__V_SUBBREV_CO_U32(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_ADD_F16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUB_F16(MachInst);
         GPUStaticInst* decode_OPU_VOP3__V_SUBREV_F16(MachInst);

diff --git a/src/arch/amdgpu/vega/insts/instructions.cc b/src/arch/amdgpu/vega/insts/instructions.cc
index 5db3e54..b0a6cb0 100644
--- a/src/arch/amdgpu/vega/insts/instructions.cc
+++ b/src/arch/amdgpu/vega/insts/instructions.cc
@@ -6988,19 +6988,19 @@

         vdst.write();
     } // execute
-    // --- Inst_VOP2__V_ADD_U32 class methods ---
+    // --- Inst_VOP2__V_ADD_CO_U32 class methods ---

-    Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_add_u32")
+    Inst_VOP2__V_ADD_CO_U32::Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_add_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP2__V_ADD_U32
+    } // Inst_VOP2__V_ADD_CO_U32

-    Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32()
+    Inst_VOP2__V_ADD_CO_U32::~Inst_VOP2__V_ADD_CO_U32()
     {
-    } // ~Inst_VOP2__V_ADD_U32
+    } // ~Inst_VOP2__V_ADD_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u + S1.u;
@@ -7008,7 +7008,7 @@
     // ---  overflow or carry-out for V_ADDC_U32.
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
     void
-    Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP2__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
@@ -7031,8 +7031,8 @@
             origSrc0_sdwa.read();
             origSrc1.read();

-            DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
-                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
+            DPRINTF(VEGA, "Handling V_ADD_CO_U32 SRC SDWA. SRC0: register "
+                    "v[%d], DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
                     "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
                     "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
                     extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,

@@ -7073,19 +7073,19 @@
         vcc.write();
         vdst.write();
     } // execute
-    // --- Inst_VOP2__V_SUB_U32 class methods ---
+    // --- Inst_VOP2__V_SUB_CO_U32 class methods ---

-    Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_sub_u32")
+    Inst_VOP2__V_SUB_CO_U32::Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_sub_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP2__V_SUB_U32
+    } // Inst_VOP2__V_SUB_CO_U32

-    Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32()
+    Inst_VOP2__V_SUB_CO_U32::~Inst_VOP2__V_SUB_CO_U32()
     {
-    } // ~Inst_VOP2__V_SUB_U32
+    } // ~Inst_VOP2__V_SUB_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u - S1.u;
@@ -7093,7 +7093,7 @@
     // carry-out for V_SUBB_U32.
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
     void
-    Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP2__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
@@ -7114,28 +7114,27 @@
         vdst.write();
         vcc.write();
     } // execute
-    // --- Inst_VOP2__V_SUBREV_U32 class methods ---
+    // --- Inst_VOP2__V_SUBREV_CO_U32 class methods ---

-    Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_subrev_u32")
+    Inst_VOP2__V_SUBREV_CO_U32::Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_subrev_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP2__V_SUBREV_U32
+    } // Inst_VOP2__V_SUBREV_CO_U32

-    Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32()
+    Inst_VOP2__V_SUBREV_CO_U32::~Inst_VOP2__V_SUBREV_CO_U32()
     {
-    } // ~Inst_VOP2__V_SUBREV_U32
+    } // ~Inst_VOP2__V_SUBREV_CO_U32

     // --- description from .arch file ---
     // D.u = S1.u - S0.u;
     // VCC[threadId] = (S0.u > S1.u ? 1 : 0) is an UNSIGNED overflow or
     // carry-out for V_SUBB_U32.
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
-    // SQ translates this to V_SUB_U32 with reversed operands.
     void
-    Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP2__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
@@ -7156,20 +7155,20 @@
         vdst.write();
         vcc.write();
     } // execute
-    // --- Inst_VOP2__V_ADDC_U32 class methods ---
+    // --- Inst_VOP2__V_ADDC_CO_U32 class methods ---

-    Inst_VOP2__V_ADDC_U32::Inst_VOP2__V_ADDC_U32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_addc_u32")
+    Inst_VOP2__V_ADDC_CO_U32::Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_addc_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ReadsVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP2__V_ADDC_U32
+    } // Inst_VOP2__V_ADDC_CO_U32

-    Inst_VOP2__V_ADDC_U32::~Inst_VOP2__V_ADDC_U32()
+    Inst_VOP2__V_ADDC_CO_U32::~Inst_VOP2__V_ADDC_CO_U32()
     {
-    } // ~Inst_VOP2__V_ADDC_U32
+    } // ~Inst_VOP2__V_ADDC_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u + S1.u + VCC[threadId];
@@ -7178,7 +7177,7 @@
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
     // source comes from the SGPR-pair at S2.u.
     void
-    Inst_VOP2__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP2__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
@@ -7204,20 +7203,20 @@
         vdst.write();
         vcc.write();
     } // execute
-    // --- Inst_VOP2__V_SUBB_U32 class methods ---
+    // --- Inst_VOP2__V_SUBB_CO_U32 class methods ---

-    Inst_VOP2__V_SUBB_U32::Inst_VOP2__V_SUBB_U32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_subb_u32")
+    Inst_VOP2__V_SUBB_CO_U32::Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_subb_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ReadsVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP2__V_SUBB_U32
+    } // Inst_VOP2__V_SUBB_CO_U32

-    Inst_VOP2__V_SUBB_U32::~Inst_VOP2__V_SUBB_U32()
+    Inst_VOP2__V_SUBB_CO_U32::~Inst_VOP2__V_SUBB_CO_U32()
     {
-    } // ~Inst_VOP2__V_SUBB_U32
+    } // ~Inst_VOP2__V_SUBB_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u - S1.u - VCC[threadId];
@@ -7226,7 +7225,7 @@
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
     // ---  source comes from the SGPR-pair at S2.u.
     void
-    Inst_VOP2__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP2__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
@@ -7250,20 +7249,20 @@
         vdst.write();
         vcc.write();
     } // execute
-    // --- Inst_VOP2__V_SUBBREV_U32 class methods ---
+    // --- Inst_VOP2__V_SUBBREV_CO_U32 class methods ---

-    Inst_VOP2__V_SUBBREV_U32::Inst_VOP2__V_SUBBREV_U32(InFmt_VOP2 *iFmt)
-        : Inst_VOP2(iFmt, "v_subbrev_u32")
+    Inst_VOP2__V_SUBBREV_CO_U32::Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_subbrev_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ReadsVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP2__V_SUBBREV_U32
+    } // Inst_VOP2__V_SUBBREV_CO_U32

-    Inst_VOP2__V_SUBBREV_U32::~Inst_VOP2__V_SUBBREV_U32()
+    Inst_VOP2__V_SUBBREV_CO_U32::~Inst_VOP2__V_SUBBREV_CO_U32()
     {
-    } // ~Inst_VOP2__V_SUBBREV_U32
+    } // ~Inst_VOP2__V_SUBBREV_CO_U32

     // --- description from .arch file ---
     // D.u = S1.u - S0.u - VCC[threadId];
@@ -7273,7 +7272,7 @@
     // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32.
     // SQ translates this to V_SUBREV_U32 with reversed operands.
     void
-    Inst_VOP2__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP2__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
@@ -7893,6 +7892,149 @@
     {
         panicUnimplemented();
     } // execute
+    // --- Inst_VOP2__V_ADD_U32 class methods ---
+
+    Inst_VOP2__V_ADD_U32::Inst_VOP2__V_ADD_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_add_u32")
+    {
+        setFlag(ALU);
+        setFlag(ValuCacGrp2);
+    } // Inst_VOP2__V_ADD_U32
+
+    Inst_VOP2__V_ADD_U32::~Inst_VOP2__V_ADD_U32()
+    {
+    } // ~Inst_VOP2__V_ADD_U32
+
+    // --- description from .arch file ---
+    // D.u = S0.u + S1.u;
+    void
+    Inst_VOP2__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        VecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+
+        if (isSDWAInst()) {
+            VecOperandU32 src0_sdwa(gpuDynInst, extData.iFmt_VOP_SDWA.SRC0);
+            // use copies of original src0, src1, and dest during selecting
+            VecOperandU32 origSrc0_sdwa(gpuDynInst,
+                                        extData.iFmt_VOP_SDWA.SRC0);
+            VecOperandU32 origSrc1(gpuDynInst, instData.VSRC1);
+            VecOperandU32 origVdst(gpuDynInst, instData.VDST);
+
+            src0_sdwa.read();
+            origSrc0_sdwa.read();
+            origSrc1.read();
+
+            DPRINTF(VEGA, "Handling V_ADD_U32 SRC SDWA. SRC0: register v[%d], "
+                    "DST_SEL: %d, DST_U: %d, CLMP: %d, SRC0_SEL: %d, "
+                    "SRC0_SEXT: %d, SRC0_NEG: %d, SRC0_ABS: %d, SRC1_SEL: %d, "
+                    "SRC1_SEXT: %d, SRC1_NEG: %d, SRC1_ABS: %d\n",
+                    extData.iFmt_VOP_SDWA.SRC0, extData.iFmt_VOP_SDWA.DST_SEL,
+                    extData.iFmt_VOP_SDWA.DST_U,
+                    extData.iFmt_VOP_SDWA.CLMP,
+                    extData.iFmt_VOP_SDWA.SRC0_SEL,
+                    extData.iFmt_VOP_SDWA.SRC0_SEXT,
+                    extData.iFmt_VOP_SDWA.SRC0_NEG,
+                    extData.iFmt_VOP_SDWA.SRC0_ABS,
+                    extData.iFmt_VOP_SDWA.SRC1_SEL,
+                    extData.iFmt_VOP_SDWA.SRC1_SEXT,
+                    extData.iFmt_VOP_SDWA.SRC1_NEG,
+                    extData.iFmt_VOP_SDWA.SRC1_ABS);
+
+            processSDWA_src(extData.iFmt_VOP_SDWA, src0_sdwa, origSrc0_sdwa,
+                            src1, origSrc1);
+
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0_sdwa[lane] + src1[lane];
+                    origVdst[lane] = vdst[lane]; // keep copy consistent
+                }
+            }
+
+            processSDWA_dst(extData.iFmt_VOP_SDWA, vdst, origVdst);
+        } else {
+            for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+                if (wf->execMask(lane)) {
+                    vdst[lane] = src0[lane] + src1[lane];
+                }
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP2__V_SUB_U32 class methods ---
+
+    Inst_VOP2__V_SUB_U32::Inst_VOP2__V_SUB_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_sub_u32")
+    {
+        setFlag(ALU);
+        setFlag(ValuCacGrp2);
+    } // Inst_VOP2__V_SUB_U32
+
+    Inst_VOP2__V_SUB_U32::~Inst_VOP2__V_SUB_U32()
+    {
+    } // ~Inst_VOP2__V_SUB_U32
+
+    // --- description from .arch file ---
+    // D.u = S0.u - S1.u;
+    void
+    Inst_VOP2__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src0[lane] - src1[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
+    // --- Inst_VOP2__V_SUBREV_U32 class methods ---
+
+    Inst_VOP2__V_SUBREV_U32::Inst_VOP2__V_SUBREV_U32(InFmt_VOP2 *iFmt)
+        : Inst_VOP2(iFmt, "v_subrev_u32")
+    {
+        setFlag(ALU);
+        setFlag(ValuCacGrp2);
+    } // Inst_VOP2__V_SUBREV_U32
+
+    Inst_VOP2__V_SUBREV_U32::~Inst_VOP2__V_SUBREV_U32()
+    {
+    } // ~Inst_VOP2__V_SUBREV_U32
+
+    // --- description from .arch file ---
+    // D.u = S1.u - S0.u;
+    void
+    Inst_VOP2__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    {
+        Wavefront *wf = gpuDynInst->wavefront();
+        ConstVecOperandU32 src0(gpuDynInst, instData.SRC0);
+        ConstVecOperandU32 src1(gpuDynInst, instData.VSRC1);
+        VecOperandU32 vdst(gpuDynInst, instData.VDST);
+
+        src0.readSrc();
+        src1.read();
+
+        for (int lane = 0; lane < NumVecElemPerVecReg; ++lane) {
+            if (wf->execMask(lane)) {
+                vdst[lane] = src1[lane] - src0[lane];
+            }
+        }
+
+        vdst.write();
+    } // execute
     // --- Inst_VOP1__V_NOP class methods ---

     Inst_VOP1__V_NOP::Inst_VOP1__V_NOP(InFmt_VOP1 *iFmt)
@@ -26157,19 +26299,19 @@

         vdst.write();
     } // execute
-    // --- Inst_VOP3__V_ADD_U32 class methods ---
+    // --- Inst_VOP3__V_ADD_CO_U32 class methods ---

-    Inst_VOP3__V_ADD_U32::Inst_VOP3__V_ADD_U32(InFmt_VOP3B *iFmt)
-        : Inst_VOP3B(iFmt, "v_add_u32")
+    Inst_VOP3__V_ADD_CO_U32::Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B *iFmt)
+        : Inst_VOP3B(iFmt, "v_add_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP3__V_ADD_U32
+    } // Inst_VOP3__V_ADD_CO_U32

-    Inst_VOP3__V_ADD_U32::~Inst_VOP3__V_ADD_U32()
+    Inst_VOP3__V_ADD_CO_U32::~Inst_VOP3__V_ADD_CO_U32()
     {
-    } // ~Inst_VOP3__V_ADD_U32
+    } // ~Inst_VOP3__V_ADD_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u + S1.u;
@@ -26177,7 +26319,7 @@
     // ---  overflow or carry-out for V_ADDC_U32.
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
     void
-    Inst_VOP3__V_ADD_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP3__V_ADD_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
@@ -26206,19 +26348,19 @@
         vdst.write();
         vcc.write();
     } // execute
-    // --- Inst_VOP3__V_SUB_U32 class methods ---
+    // --- Inst_VOP3__V_SUB_CO_U32 class methods ---

-    Inst_VOP3__V_SUB_U32::Inst_VOP3__V_SUB_U32(InFmt_VOP3B *iFmt)
-        : Inst_VOP3B(iFmt, "v_sub_u32")
+    Inst_VOP3__V_SUB_CO_U32::Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B *iFmt)
+        : Inst_VOP3B(iFmt, "v_sub_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP3__V_SUB_U32
+    } // Inst_VOP3__V_SUB_CO_U32

-    Inst_VOP3__V_SUB_U32::~Inst_VOP3__V_SUB_U32()
+    Inst_VOP3__V_SUB_CO_U32::~Inst_VOP3__V_SUB_CO_U32()
     {
-    } // ~Inst_VOP3__V_SUB_U32
+    } // ~Inst_VOP3__V_SUB_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u - S1.u;
@@ -26226,7 +26368,7 @@
     // carry-out for V_SUBB_U32.
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
     void
-    Inst_VOP3__V_SUB_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP3__V_SUB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
@@ -26254,20 +26396,20 @@
         vdst.write();
         vcc.write();
     } // execute
-    // --- Inst_VOP3__V_SUBREV_U32 class methods ---
+    // --- Inst_VOP3__V_SUBREV_CO_U32 class methods ---

-    Inst_VOP3__V_SUBREV_U32::Inst_VOP3__V_SUBREV_U32(
+    Inst_VOP3__V_SUBREV_CO_U32::Inst_VOP3__V_SUBREV_CO_U32(
           InFmt_VOP3B *iFmt)
-        : Inst_VOP3B(iFmt, "v_subrev_u32")
+        : Inst_VOP3B(iFmt, "v_subrev_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP3__V_SUBREV_U32
+    } // Inst_VOP3__V_SUBREV_CO_U32

-    Inst_VOP3__V_SUBREV_U32::~Inst_VOP3__V_SUBREV_U32()
+    Inst_VOP3__V_SUBREV_CO_U32::~Inst_VOP3__V_SUBREV_CO_U32()
     {
-    } // ~Inst_VOP3__V_SUBREV_U32
+    } // ~Inst_VOP3__V_SUBREV_CO_U32

     // --- description from .arch file ---
     // D.u = S1.u - S0.u;
@@ -26276,7 +26418,7 @@
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair.
     // SQ translates this to V_SUB_U32 with reversed operands.
     void
-    Inst_VOP3__V_SUBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP3__V_SUBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);
@@ -26304,20 +26446,20 @@
         vdst.write();
         vcc.write();
     } // execute
-    // --- Inst_VOP3__V_ADDC_U32 class methods ---
+    // --- Inst_VOP3__V_ADDC_CO_U32 class methods ---

-    Inst_VOP3__V_ADDC_U32::Inst_VOP3__V_ADDC_U32(InFmt_VOP3B *iFmt)
-        : Inst_VOP3B(iFmt, "v_addc_u32")
+    Inst_VOP3__V_ADDC_CO_U32::Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B *iFmt)
+        : Inst_VOP3B(iFmt, "v_addc_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ReadsVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP3__V_ADDC_U32
+    } // Inst_VOP3__V_ADDC_CO_U32

-    Inst_VOP3__V_ADDC_U32::~Inst_VOP3__V_ADDC_U32()
+    Inst_VOP3__V_ADDC_CO_U32::~Inst_VOP3__V_ADDC_CO_U32()
     {
-    } // ~Inst_VOP3__V_ADDC_U32
+    } // ~Inst_VOP3__V_ADDC_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u + S1.u + VCC[threadId];
@@ -26326,7 +26468,7 @@
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
     // source comes from the SGPR-pair at S2.u.
     void
-    Inst_VOP3__V_ADDC_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP3__V_ADDC_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
@@ -26360,20 +26502,20 @@
         vdst.write();
         sdst.write();
     } // execute
-    // --- Inst_VOP3__V_SUBB_U32 class methods ---
+    // --- Inst_VOP3__V_SUBB_CO_U32 class methods ---

-    Inst_VOP3__V_SUBB_U32::Inst_VOP3__V_SUBB_U32(InFmt_VOP3B *iFmt)
-        : Inst_VOP3B(iFmt, "v_subb_u32")
+    Inst_VOP3__V_SUBB_CO_U32::Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B *iFmt)
+        : Inst_VOP3B(iFmt, "v_subb_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ReadsVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP3__V_SUBB_U32
+    } // Inst_VOP3__V_SUBB_CO_U32

-    Inst_VOP3__V_SUBB_U32::~Inst_VOP3__V_SUBB_U32()
+    Inst_VOP3__V_SUBB_CO_U32::~Inst_VOP3__V_SUBB_CO_U32()
     {
-    } // ~Inst_VOP3__V_SUBB_U32
+    } // ~Inst_VOP3__V_SUBB_CO_U32

     // --- description from .arch file ---
     // D.u = S0.u - S1.u - VCC[threadId];
@@ -26382,7 +26524,7 @@
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
     // ---  source comes from the SGPR-pair at S2.u.
     void
-    Inst_VOP3__V_SUBB_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP3__V_SUBB_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src0(gpuDynInst, extData.SRC0);
@@ -26414,21 +26556,21 @@
         vdst.write();
         sdst.write();
     } // execute
-    // --- Inst_VOP3__V_SUBBREV_U32 class methods ---
+    // --- Inst_VOP3__V_SUBBREV_CO_U32 class methods ---

-    Inst_VOP3__V_SUBBREV_U32::Inst_VOP3__V_SUBBREV_U32(
+    Inst_VOP3__V_SUBBREV_CO_U32::Inst_VOP3__V_SUBBREV_CO_U32(
           InFmt_VOP3B *iFmt)
-        : Inst_VOP3B(iFmt, "v_subbrev_u32")
+        : Inst_VOP3B(iFmt, "v_subbrev_co_u32")
     {
         setFlag(ALU);
         setFlag(WritesVCC);
         setFlag(ReadsVCC);
         setFlag(ValuCacGrp2);
-    } // Inst_VOP3__V_SUBBREV_U32
+    } // Inst_VOP3__V_SUBBREV_CO_U32

-    Inst_VOP3__V_SUBBREV_U32::~Inst_VOP3__V_SUBBREV_U32()
+    Inst_VOP3__V_SUBBREV_CO_U32::~Inst_VOP3__V_SUBBREV_CO_U32()
     {
-    } // ~Inst_VOP3__V_SUBBREV_U32
+    } // ~Inst_VOP3__V_SUBBREV_CO_U32

     // --- description from .arch file ---
     // D.u = S1.u - S0.u - VCC[threadId];
@@ -26436,9 +26578,8 @@
     // overflow.
     // In VOP3 the VCC destination may be an arbitrary SGPR-pair, and the VCC
     // source comes from the SGPR-pair at S2.u. SQ translates to V_SUBB_U32.
-    // SQ translates this to V_SUBREV_U32 with reversed operands.
     void
-    Inst_VOP3__V_SUBBREV_U32::execute(GPUDynInstPtr gpuDynInst)
+    Inst_VOP3__V_SUBBREV_CO_U32::execute(GPUDynInstPtr gpuDynInst)
     {
         Wavefront *wf = gpuDynInst->wavefront();
         ConstVecOperandU32 src1(gpuDynInst, extData.SRC1);

diff --git a/src/arch/amdgpu/vega/insts/instructions.hh b/src/arch/amdgpu/vega/insts/instructions.hh
index 5c0ea8c..b815d3e 100644
--- a/src/arch/amdgpu/vega/insts/instructions.hh
+++ b/src/arch/amdgpu/vega/insts/instructions.hh
@@ -6987,11 +6987,11 @@
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP2__V_MADAK_F32

-    class Inst_VOP2__V_ADD_U32 : public Inst_VOP2
+    class Inst_VOP2__V_ADD_CO_U32 : public Inst_VOP2
     {
       public:
-        Inst_VOP2__V_ADD_U32(InFmt_VOP2*);
-        ~Inst_VOP2__V_ADD_U32();
+        Inst_VOP2__V_ADD_CO_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_ADD_CO_U32();

         int
         getNumOperands() override
@@ -7021,13 +7021,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP2__V_ADD_U32
+    }; // Inst_VOP2__V_ADD_CO_U32

-    class Inst_VOP2__V_SUB_U32 : public Inst_VOP2
+    class Inst_VOP2__V_SUB_CO_U32 : public Inst_VOP2
     {
       public:
-        Inst_VOP2__V_SUB_U32(InFmt_VOP2*);
-        ~Inst_VOP2__V_SUB_U32();
+        Inst_VOP2__V_SUB_CO_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_SUB_CO_U32();

         int
         getNumOperands() override
@@ -7057,13 +7057,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP2__V_SUB_U32
+    }; // Inst_VOP2__V_SUB_CO_U32

-    class Inst_VOP2__V_SUBREV_U32 : public Inst_VOP2
+    class Inst_VOP2__V_SUBREV_CO_U32 : public Inst_VOP2
     {
       public:
-        Inst_VOP2__V_SUBREV_U32(InFmt_VOP2*);
-        ~Inst_VOP2__V_SUBREV_U32();
+        Inst_VOP2__V_SUBREV_CO_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_SUBREV_CO_U32();

         int
         getNumOperands() override
@@ -7093,13 +7093,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP2__V_SUBREV_U32
+    }; // Inst_VOP2__V_SUBREV_CO_U32

-    class Inst_VOP2__V_ADDC_U32 : public Inst_VOP2
+    class Inst_VOP2__V_ADDC_CO_U32 : public Inst_VOP2
     {
       public:
-        Inst_VOP2__V_ADDC_U32(InFmt_VOP2*);
-        ~Inst_VOP2__V_ADDC_U32();
+        Inst_VOP2__V_ADDC_CO_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_ADDC_CO_U32();

         int
         getNumOperands() override
@@ -7131,13 +7131,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP2__V_ADDC_U32
+    }; // Inst_VOP2__V_ADDC_CO_U32

-    class Inst_VOP2__V_SUBB_U32 : public Inst_VOP2
+    class Inst_VOP2__V_SUBB_CO_U32 : public Inst_VOP2
     {
       public:
-        Inst_VOP2__V_SUBB_U32(InFmt_VOP2*);
-        ~Inst_VOP2__V_SUBB_U32();
+        Inst_VOP2__V_SUBB_CO_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_SUBB_CO_U32();

         int
         getNumOperands() override
@@ -7169,13 +7169,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP2__V_SUBB_U32
+    }; // Inst_VOP2__V_SUBB_CO_U32

-    class Inst_VOP2__V_SUBBREV_U32 : public Inst_VOP2
+    class Inst_VOP2__V_SUBBREV_CO_U32 : public Inst_VOP2
     {
       public:
-        Inst_VOP2__V_SUBBREV_U32(InFmt_VOP2*);
-        ~Inst_VOP2__V_SUBBREV_U32();
+        Inst_VOP2__V_SUBBREV_CO_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_SUBBREV_CO_U32();

         int
         getNumOperands() override
@@ -7207,7 +7207,7 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP2__V_SUBBREV_U32
+    }; // Inst_VOP2__V_SUBBREV_CO_U32

     class Inst_VOP2__V_ADD_F16 : public Inst_VOP2
     {
@@ -7927,6 +7927,108 @@
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP2__V_LDEXP_F16

+    class Inst_VOP2__V_ADD_U32 : public Inst_VOP2
+    {
+      public:
+        Inst_VOP2__V_ADD_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_ADD_U32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP2__V_ADD_U32
+
+    class Inst_VOP2__V_SUB_U32 : public Inst_VOP2
+    {
+      public:
+        Inst_VOP2__V_SUB_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_SUB_U32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP2__V_SUB_U32
+
+    class Inst_VOP2__V_SUBREV_U32 : public Inst_VOP2
+    {
+      public:
+        Inst_VOP2__V_SUBREV_U32(InFmt_VOP2*);
+        ~Inst_VOP2__V_SUBREV_U32();
+
+        int
+        getNumOperands() override
+        {
+            return numDstRegOperands() + numSrcRegOperands();
+        } // getNumOperands
+
+        int numDstRegOperands() override { return 1; }
+        int numSrcRegOperands() override { return 2; }
+
+        int
+        getOperandSize(int opIdx) override
+        {
+            switch (opIdx) {
+              case 0: //src_0
+                return 4;
+              case 1: //src_1
+                return 4;
+              case 2: //vdst
+                return 4;
+              default:
+                fatal("op idx %i out of bounds\n", opIdx);
+                return -1;
+            }
+        } // getOperandSize
+
+        void execute(GPUDynInstPtr) override;
+    }; // Inst_VOP2__V_SUBREV_U32
+
     class Inst_VOP1__V_NOP : public Inst_VOP1
     {
       public:
@@ -24637,11 +24739,11 @@
         void execute(GPUDynInstPtr) override;
     }; // Inst_VOP3__V_MAC_F32

-    class Inst_VOP3__V_ADD_U32 : public Inst_VOP3B
+    class Inst_VOP3__V_ADD_CO_U32 : public Inst_VOP3B
     {
       public:
-        Inst_VOP3__V_ADD_U32(InFmt_VOP3B*);
-        ~Inst_VOP3__V_ADD_U32();
+        Inst_VOP3__V_ADD_CO_U32(InFmt_VOP3B*);
+        ~Inst_VOP3__V_ADD_CO_U32();

         int
         getNumOperands() override
@@ -24671,13 +24773,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP3__V_ADD_U32
+    }; // Inst_VOP3__V_ADD_CO_U32

-    class Inst_VOP3__V_SUB_U32 : public Inst_VOP3B
+    class Inst_VOP3__V_SUB_CO_U32 : public Inst_VOP3B
     {
       public:
-        Inst_VOP3__V_SUB_U32(InFmt_VOP3B*);
-        ~Inst_VOP3__V_SUB_U32();
+        Inst_VOP3__V_SUB_CO_U32(InFmt_VOP3B*);
+        ~Inst_VOP3__V_SUB_CO_U32();

         int
         getNumOperands() override
@@ -24707,13 +24809,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP3__V_SUB_U32
+    }; // Inst_VOP3__V_SUB_CO_U32

-    class Inst_VOP3__V_SUBREV_U32 : public Inst_VOP3B
+    class Inst_VOP3__V_SUBREV_CO_U32 : public Inst_VOP3B
     {
       public:
-        Inst_VOP3__V_SUBREV_U32(InFmt_VOP3B*);
-        ~Inst_VOP3__V_SUBREV_U32();
+        Inst_VOP3__V_SUBREV_CO_U32(InFmt_VOP3B*);
+        ~Inst_VOP3__V_SUBREV_CO_U32();

         int
         getNumOperands() override
@@ -24743,13 +24845,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP3__V_SUBREV_U32
+    }; // Inst_VOP3__V_SUBREV_CO_U32

-    class Inst_VOP3__V_ADDC_U32 : public Inst_VOP3B
+    class Inst_VOP3__V_ADDC_CO_U32 : public Inst_VOP3B
     {
       public:
-        Inst_VOP3__V_ADDC_U32(InFmt_VOP3B*);
-        ~Inst_VOP3__V_ADDC_U32();
+        Inst_VOP3__V_ADDC_CO_U32(InFmt_VOP3B*);
+        ~Inst_VOP3__V_ADDC_CO_U32();

         int
         getNumOperands() override
@@ -24781,13 +24883,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP3__V_ADDC_U32
+    }; // Inst_VOP3__V_ADDC_CO_U32

-    class Inst_VOP3__V_SUBB_U32 : public Inst_VOP3B
+    class Inst_VOP3__V_SUBB_CO_U32 : public Inst_VOP3B
     {
       public:
-        Inst_VOP3__V_SUBB_U32(InFmt_VOP3B*);
-        ~Inst_VOP3__V_SUBB_U32();
+        Inst_VOP3__V_SUBB_CO_U32(InFmt_VOP3B*);
+        ~Inst_VOP3__V_SUBB_CO_U32();

         int
         getNumOperands() override
@@ -24819,13 +24921,13 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP3__V_SUBB_U32
+    }; // Inst_VOP3__V_SUBB_CO_U32

-    class Inst_VOP3__V_SUBBREV_U32 : public Inst_VOP3B
+    class Inst_VOP3__V_SUBBREV_CO_U32 : public Inst_VOP3B
     {
       public:
-        Inst_VOP3__V_SUBBREV_U32(InFmt_VOP3B*);
-        ~Inst_VOP3__V_SUBBREV_U32();
+        Inst_VOP3__V_SUBBREV_CO_U32(InFmt_VOP3B*);
+        ~Inst_VOP3__V_SUBBREV_CO_U32();

         int
         getNumOperands() override
@@ -24857,7 +24959,7 @@
         } // getOperandSize

         void execute(GPUDynInstPtr) override;
-    }; // Inst_VOP3__V_SUBBREV_U32
+    }; // Inst_VOP3__V_SUBBREV_CO_U32

     class Inst_VOP3__V_ADD_F16 : public Inst_VOP3A
     {

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/47240

To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings


Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I002fa6e9316d38fd4cc3554daff047523cfc12c9
Gerrit-Change-Number: 47240
Gerrit-PatchSet: 1
Gerrit-Owner: Michael Boyer <[email protected]>
Gerrit-MessageType: newchange

_______________________________________________
gem5-dev mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s

[gem5-dev] Change in gem5/gem5[develop]: arch-vega: Implement non-carry-out VEGA add, sub, and subrev

Reply via email to