Attention is currently required from: Richard Cooper.
Hello Richard Cooper,
I'd like you to do a code review.
Please visit
https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email
to review the following change.
Change subject: arch-arm: Support Arm SVE Load-Broadcast Octaword
instructions.
......................................................................
arch-arm: Support Arm SVE Load-Broadcast Octaword instructions.
Add support for the Arm SVE Load-Broadcast Octaword (LD1RO{B,H,W,D})
instructions. These are similar to the Load-Broadcast
Quadword (LD1RQ{B,H,W,D}) instructions, but work on a 32-byte memory
segment rather than a 16-byte memory segment. Consequently, the LD1ROx
implementations build on the code for the LD1RQx implementations.
For more information please refer to the "ARM Architecture Reference
Manual Supplement - The Scalable Vector Extension (SVE), for ARMv8-A"
(https://developer.arm.com/architectures/cpu-architecture/a-profile/
docs/arm-architecture-reference-manual-supplement-armv8-a)
Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Reviewed-by: Richard Cooper <richard.coo...@arm.com>
---
M src/arch/arm/isa/formats/sve_2nd_level.isa
M src/arch/arm/isa/insts/sve_mem.isa
2 files changed, 115 insertions(+), 54 deletions(-)
diff --git a/src/arch/arm/isa/formats/sve_2nd_level.isa
b/src/arch/arm/isa/formats/sve_2nd_level.isa
index bb3c8e2..be57223 100644
--- a/src/arch/arm/isa/formats/sve_2nd_level.isa
+++ b/src/arch/arm/isa/formats/sve_2nd_level.isa
@@ -2839,9 +2839,9 @@
StaticInstPtr
decodeSveFpFusedMatMulAdd(ExtMachInst machInst)
{
- IntRegIndex zda = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
- IntRegIndex zn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
- IntRegIndex zm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+ RegIndex zda = (RegIndex) (uint8_t) bits(machInst, 4, 0);
+ RegIndex zn = (RegIndex) (uint8_t) bits(machInst, 9, 5);
+ RegIndex zm = (RegIndex) (uint8_t) bits(machInst, 20, 16);
uint8_t size = bits(machInst, 23, 22);
switch (size) {
@@ -3173,66 +3173,96 @@
} // decodeSveMemGather32
StaticInstPtr
- decodeSveLoadBcastQuadSS(ExtMachInst machInst)
+ decodeSveLoadBcastMultiSS(ExtMachInst machInst)
{
- uint8_t num = bits(machInst, 22, 21);
- if (num != 0x00) {
- return new Unknown64(machInst);
- }
-
RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
RegIndex rm = (RegIndex)(uint8_t) bits(machInst, 20, 16);
- uint8_t msz = bits(machInst, 24, 23);
- switch (msz) {
- case 0:
+
+ uint8_t msz_esz = bits(machInst, 24, 21);
+
+ switch (msz_esz) {
+ // Load-Broadcast Quad-word Variants
+ case 0b0000: // 0x0:
return new SveLd1RqSS<uint8_t, uint8_t>("ld1rqb",
machInst, zt, pg, rn, rm);
- case 1:
- return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
+ case 0b0100: // 0x4:
+ return new SveLd1RqSS<uint16_t, uint16_t>("ld1rqh",
machInst, zt, pg, rn, rm);
- case 2:
+ case 0b1000: // 0x8:
return new SveLd1RqSS<uint32_t, uint32_t>("ld1rqw",
machInst, zt, pg, rn, rm);
- case 3:
+ case 0b1100: // 0xc:
return new SveLd1RqSS<uint64_t, uint64_t>("ld1rqd",
machInst, zt, pg, rn, rm);
+
+ // Load-Broadcast Octa-word Variants
+ case 0b0001: // 0x1:
+ return new SveLd1RoSS<uint8_t, uint8_t>("ld1rqb",
+ machInst, zt, pg, rn, rm);
+ case 0b0101: // 0x5:
+ return new SveLd1RoSS<uint16_t, uint16_t>("ld1rqh",
+ machInst, zt, pg, rn, rm);
+ case 0b1001: // 0x9:
+ return new SveLd1RoSS<uint32_t, uint32_t>("ld1rqw",
+ machInst, zt, pg, rn, rm);
+ case 0b1101: // 0xd:
+ return new SveLd1RoSS<uint64_t, uint64_t>("ld1rqd",
+ machInst, zt, pg, rn, rm);
+
+ default:
+ return new Unknown64(machInst);
}
return new Unknown64(machInst);
- } // decodeSveLoadBcastQuadSS
+ } // decodeSveLoadBcastMultiSS
StaticInstPtr
- decodeSveLoadBcastQuadSI(ExtMachInst machInst)
+ decodeSveLoadBcastMultiSI(ExtMachInst machInst)
{
- uint8_t num = bits(machInst, 22, 21);
- if (num != 0x00) {
- return new Unknown64(machInst);
- }
-
RegIndex zt = (RegIndex)(uint8_t) bits(machInst, 4, 0);
RegIndex rn = makeSP((RegIndex)(uint8_t) bits(machInst, 9, 5));
RegIndex pg = (RegIndex)(uint8_t) bits(machInst, 12, 10);
uint64_t imm = sext<4>(bits(machInst, 19, 16));
- uint8_t msz = bits(machInst, 24, 23);
- switch (msz) {
- case 0:
+
+ uint8_t msz_esz = bits(machInst, 24, 21);
+
+ switch (msz_esz) {
+ // Load-Broadcast Quad-word Variants
+ case 0b0000: // 0x0:
return new SveLd1RqSI<uint8_t, uint8_t>("ld1rqb",
machInst, zt, pg, rn, imm);
- case 1:
+ case 0b0100: // 0x4:
return new SveLd1RqSI<uint16_t, uint16_t>("ld1rqh",
machInst, zt, pg, rn, imm);
- case 2:
+ case 0b1000: // 0x8:
return new SveLd1RqSI<uint32_t, uint32_t>("ld1rqw",
machInst, zt, pg, rn, imm);
- case 3:
+ case 0b1100: // 0xc:
return new SveLd1RqSI<uint64_t, uint64_t>("ld1rqd",
machInst, zt, pg, rn, imm);
+
+ // Load-Broadcast Octa-word Variants
+ case 0b0001: // 0x1:
+ return new SveLd1RoSI<uint8_t, uint8_t>("ld1rqb",
+ machInst, zt, pg, rn, imm);
+ case 0b0101: // 0x5:
+ return new SveLd1RoSI<uint16_t, uint16_t>("ld1rqh",
+ machInst, zt, pg, rn, imm);
+ case 0b1001: // 0x9:
+ return new SveLd1RoSI<uint32_t, uint32_t>("ld1rqw",
+ machInst, zt, pg, rn, imm);
+ case 0b1101: // 0xd:
+ return new SveLd1RoSI<uint64_t, uint64_t>("ld1rqd",
+ machInst, zt, pg, rn, imm);
+
+ default:
+ return new Unknown64(machInst);
}
return new Unknown64(machInst);
- } // decodeSveLoadBcastQuadSI
+ } // decodeSveLoadBcastMultiSI
StaticInstPtr
decodeSveContigLoadSS(ExtMachInst machInst)
@@ -3342,10 +3372,10 @@
{
switch (bits(machInst, 15, 13)) {
case 0x0:
- return decodeSveLoadBcastQuadSS(machInst);
+ return decodeSveLoadBcastMultiSS(machInst);
case 0x1:
if (bits(machInst, 20) == 0x0) {
- return decodeSveLoadBcastQuadSI(machInst);
+ return decodeSveLoadBcastMultiSI(machInst);
}
break;
case 0x2:
diff --git a/src/arch/arm/isa/insts/sve_mem.isa
b/src/arch/arm/isa/insts/sve_mem.isa
index 8a73d13..bece368 100644
--- a/src/arch/arm/isa/insts/sve_mem.isa
+++ b/src/arch/arm/isa/insts/sve_mem.isa
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2019 ARM Limited
+// Copyright (c) 2017-2020 ARM Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
@@ -1480,20 +1480,33 @@
exec_output += SveStructMemExecDeclare.subst(substDict)
# Generates definitions for SVE load-and-replicate quadword
instructions
- def emitSveLoadAndReplQuad(offsetIsImm):
+ def emitSveLoadAndReplMulti(offsetIsImm, numQwordSegments):
global header_output, exec_output, decoders
+ assert(numQwordSegments in (1, 2)) # Quadword or Octaword
+ from collections import namedtuple
+ InstConfig = namedtuple("_InstConfig", "mnemonic classname
baseclass")
+ INST_CONFIGURATIONS = {
+ # (offsetIsImm, numQwordSegments) -> InstConfig Recors
+ (True, 1): InstConfig("ld1rq", "SveLd1RqSI", "SveContigMemSI"),
+ (False, 1):
InstConfig("ld1rq", "SveLd1RqSS", "SveContigMemSS"),
+ (True, 2): InstConfig("ld1ro", "SveLd1RoSI", "SveContigMemSI"),
+ (False, 2):
InstConfig("ld1ro", "SveLd1RoSS", "SveContigMemSS"),
+ }
+ inst_config = INST_CONFIGURATIONS[(offsetIsImm, numQwordSegments)]
+ memAccessSize = numQwordSegments * 16;
tplHeader = 'template <class RegElemType, class MemElemType>'
tplArgs = '<RegElemType, MemElemType>'
eaCode = SPAlignmentCheckCode + '''
- int memAccessSize = 16;
- EA = XBase + '''
+ int memAccessSize = %(memAccessSize)d;
+ EA = XBase + ''' % dict(memAccessSize=memAccessSize)
if offsetIsImm:
- eaCode += '(((int64_t) this->imm) * 16);'
+ eaCode += ('(((int64_t) this->imm) * %(memAccessSize)d);'
+ % dict(memAccessSize=memAccessSize))
else:
eaCode += '(XOffset * sizeof(MemElemType));'
loadRdEnableCode = '''
- eCount = 16/sizeof(RegElemType);
- auto rdEn = std::vector<bool>(16, true);
+ eCount = %(memAccessSize)d/sizeof(RegElemType);
+ auto rdEn = std::vector<bool>(%(memAccessSize)d, true);
for (int i = 0; i < eCount; ++i) {
if (!GpOp_x[i]) {
for (int j = 0; j < sizeof(RegElemType); ++j) {
@@ -1501,26 +1514,40 @@
}
}
}
- '''
+ ''' % dict(memAccessSize=memAccessSize)
memAccCode = '''
- __uint128_t qword;
- RegElemType* qp = reinterpret_cast<RegElemType*>(&qword);
- for (int i = 0; i < 16/sizeof(RegElemType); ++i) {
+ // Copy active elements of the data from memory into a temporary
+ // quadword/octaword
+ __uint128_t qwords[%(numQwordSegments)d];
+ eCount = %(memAccessSize)d/sizeof(RegElemType);
+ RegElemType* qp = reinterpret_cast<RegElemType*>(&qwords);
+ for (int i = 0; i < eCount; ++i) {
if (GpOp_x[i]) {
qp[i] = memDataView[i];
} else {
qp[i] = 0;
}
}
- eCount = ArmStaticInst::getCurSveVecLen<__uint128_t>(
+ // Repeat the temporary quadword/octaword segment into the
+ // vector register. Zero fill the remainder for non-full
+ // octawords.
+ unsigned numQwords = ArmStaticInst::getCurSveVecLen<__uint128_t>(
xc->tcBase());
- for (int i = 0; i < eCount; ++i) {
- AA64FpDest_uq[i] = qword;
+ unsigned numFullQwords = numQwords -
+ (numQwords %% %(numQwordSegments)d);
+ for (int i = 0; i < numQwords; ++i) {
+ if (i < numFullQwords) {
+ AA64FpDest_uq[i] = qwords[i %% %(numQwordSegments)d];
+ } else {
+ AA64FpDest_uq[i] = 0;
+ }
}
- '''
- iop = ArmInstObjParams('ld1rq',
- 'SveLd1RqSI' if offsetIsImm else 'SveLd1RqSS',
- 'SveContigMemSI' if offsetIsImm else 'SveContigMemSS',
+ ''' % dict(memAccessSize=memAccessSize,
+ numQwordSegments=numQwordSegments)
+ iop = ArmInstObjParams(
+ inst_config.mnemonic,
+ inst_config.classname,
+ inst_config.baseclass,
{'tpl_header': tplHeader,
'tpl_args': tplArgs,
'rden_code': loadRdEnableCode,
@@ -1539,8 +1566,7 @@
SveContigLoadCompleteAcc.subst(iop))
for ttype in ('uint8_t', 'uint16_t', 'uint32_t', 'uint64_t'):
substDict = {'tpl_args': '<%s, %s>' % (ttype, ttype),
- 'class_name': 'SveLd1RqSI' if offsetIsImm
- else 'SveLd1RqSS'}
+ 'class_name': inst_config.classname}
exec_output += SveContigMemExecDeclare.subst(substDict)
# LD1[S]{B,H,W,D} (scalar plus immediate)
@@ -1556,9 +1582,14 @@
emitSveLoadAndRepl()
# LD1RQ{B,H,W,D} (scalar plus immediate)
- emitSveLoadAndReplQuad(offsetIsImm = True)
+ emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=1)
# LD1RQ{B,H,W,D} (scalar plus scalar)
- emitSveLoadAndReplQuad(offsetIsImm = False)
+ emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=1)
+
+ # LD1RO{B,H,W,D} (scalar plus immediate)
+ emitSveLoadAndReplMulti(offsetIsImm=True, numQwordSegments=2)
+ # LD1RO{B,H,W,D} (scalar plus scalar)
+ emitSveLoadAndReplMulti(offsetIsImm=False, numQwordSegments=2)
# LD{2,3,4}{B,H,W,D} (scalar plus immediate)
# ST{2,3,4}{B,H,W,D} (scalar plus immediate)
--
To view, visit
https://gem5-review.googlesource.com/c/public/gem5/+/70727?usp=email
To unsubscribe, or for help writing mail filters, visit
https://gem5-review.googlesource.com/settings?usp=email
Gerrit-MessageType: newchange
Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I98ee4f56c8099bf40c9034baa488d318ae57d3aa
Gerrit-Change-Number: 70727
Gerrit-PatchSet: 1
Gerrit-Owner: Giacomo Travaglini <giacomo.travagl...@arm.com>
Gerrit-Reviewer: Richard Cooper <richard.coo...@arm.com>
Gerrit-Attention: Richard Cooper <richard.coo...@arm.com>
_______________________________________________
gem5-dev mailing list -- gem5-dev@gem5.org
To unsubscribe send an email to gem5-dev-le...@gem5.org