Signed-off-by: Stephen Long <stepl...@quicinc.com>

I'm guessing endianness doesn't matter because we are writing to the
corresponding 32-bit/64-bit element in the destination register.
---
 target/arm/cpu.h           | 10 +++++++
 target/arm/helper-sve.h    |  3 ++
 target/arm/sve.decode      |  4 +++
 target/arm/sve_helper.c    | 60 ++++++++++++++++++++++++++++++++++++++++
 target/arm/translate-sve.c | 39 ++++++++++++++++++++++++++
 5 files changed, 116 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index b7c7946771..d41c4a08c0 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -3870,6 +3870,16 @@ static inline bool isar_feature_aa64_sve2_bitperm(const ARMISARegisters *id)
     return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, BITPERM) != 0;
 }
 
+static inline bool isar_feature_aa64_sve2_f32mm(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, F32MM) != 0;
+}
+
+static inline bool isar_feature_aa64_sve2_f64mm(const ARMISARegisters *id)
+{
+    return FIELD_EX64(id->id_aa64zfr0, ID_AA64ZFR0, F64MM) != 0;
+}
+
 /*
  * Feature tests for "does this exist in either 32-bit or 64-bit?"
  */
diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
index ea53750141..8104d23c5f 100644
--- a/target/arm/helper-sve.h
+++ b/target/arm/helper-sve.h
@@ -2683,3 +2683,6 @@ DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_zzzz_s, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(sve2_sqrdcmlah_zzzz_d, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_6(fmmla_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_6(fmmla_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, ptr, i32)
diff --git a/target/arm/sve.decode b/target/arm/sve.decode
index 95c73c665a..dd987da648 100644
--- a/target/arm/sve.decode
+++ b/target/arm/sve.decode
@@ -1383,3 +1383,7 @@ UMLSLT_zzzw     01000100 .. 0 ..... 010 111 ..... .....  @rda_rn_rm
 
 CMLA_zzzz       01000100 esz:2 0 rm:5 0010 rot:2 rn:5 rd:5  ra=%reg_movprfx
 SQRDCMLAH_zzzz  01000100 esz:2 0 rm:5 0011 rot:2 rn:5 rd:5  ra=%reg_movprfx
+
+### SVE2 floating point matrix multiply accumulate
+
+FMMLA           01100100 .. 1 ..... 111001 ..... .....  @rda_rn_rm
diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c
index b392a87aef..4646107f2e 100644
--- a/target/arm/sve_helper.c
+++ b/target/arm/sve_helper.c
@@ -7389,3 +7389,63 @@ void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
         *(uint64_t *)(vd + i + 8) = out1;
     }
 }
+
+/*
+ * FMMLA: floating-point matrix multiply-accumulate.
+ *
+ * Each vector is treated as a sequence of segments of four TYPE
+ * elements, every segment holding one row-major 2x2 matrix.  For each
+ * segment this computes
+ *     d[i][j] = a[i][j] + (n[i][0] * m[j][0] + n[i][1] * m[j][1])
+ * using separate MUL and ADD calls for every product and sum.
+ *
+ * All inputs of a segment are read before that segment of vd is
+ * written, so vd may be the same register as va, vn or vm.
+ *
+ * NOTE(review): elements are indexed in host memory order; confirm
+ * whether 32-bit elements need a host-endian index adjustment.
+ */
+#define DO_FP_MATRIX_MUL(NAME, TYPE, MUL, ADD)                       \
+void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,            \
+                  void *status, uint32_t desc)                       \
+{                                                                    \
+    /* One segment is a 2x2 matrix: four elements of TYPE. */        \
+    const intptr_t seg_sz = sizeof(TYPE) * 4;                        \
+    intptr_t s, opr_sz = simd_oprsz(desc) / seg_sz;                  \
+                                                                     \
+    for (s = 0; s < opr_sz; ++s) {                                   \
+        TYPE *n = vn + s * seg_sz;                                   \
+        TYPE *m = vm + s * seg_sz;                                   \
+        TYPE *a = va + s * seg_sz;                                   \
+        TYPE *d = vd + s * seg_sz;                                   \
+                                                                     \
+        TYPE n00 = n[0], n01 = n[1], n10 = n[2], n11 = n[3];         \
+        TYPE m00 = m[0], m01 = m[1], m10 = m[2], m11 = m[3];         \
+        TYPE p0, p1, results[4];                                     \
+                                                                     \
+        /* i = 0, j = 0 */                                           \
+        p0 = MUL(n00, m00, status);                                  \
+        p1 = MUL(n01, m01, status);                                  \
+        results[0] = ADD(a[0], ADD(p0, p1, status), status);         \
+                                                                     \
+        /* i = 0, j = 1 */                                           \
+        p0 = MUL(n00, m10, status);                                  \
+        p1 = MUL(n01, m11, status);                                  \
+        results[1] = ADD(a[1], ADD(p0, p1, status), status);         \
+                                                                     \
+        /* i = 1, j = 0 */                                           \
+        p0 = MUL(n10, m00, status);                                  \
+        p1 = MUL(n11, m01, status);                                  \
+        results[2] = ADD(a[2], ADD(p0, p1, status), status);         \
+                                                                     \
+        /* i = 1, j = 1 */                                           \
+        p0 = MUL(n10, m10, status);                                  \
+        p1 = MUL(n11, m11, status);                                  \
+        results[3] = ADD(a[3], ADD(p0, p1, status), status);         \
+                                                                     \
+        memcpy(d, results, sizeof(results));                         \
+    }                                                                \
+}
+
+DO_FP_MATRIX_MUL(fmmla_s, float32, float32_mul, float32_add)
+DO_FP_MATRIX_MUL(fmmla_d, float64, float64_mul, float64_add)
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 0cbb35c691..29532424c1 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -7615,6 +7615,45 @@ static bool do_sve2_zzzz_fn(DisasContext *s, int rd, int rn, int rm, int ra,
     return true;
 }
 
+/*
+ * FMMLA: SVE floating-point matrix multiply-accumulate.
+ *
+ * Returns false for element sizes below 32 bits and for a 32-bit or
+ * 64-bit form whose feature bit (F32MM / F64MM) is not set.  Otherwise
+ * returns true, emitting the gvec helper call only when
+ * sve_access_check() succeeds.
+ */
+static bool trans_FMMLA(DisasContext *s, arg_rrrr_esz *a)
+{
+    /* Indexed by esz - MO_32. */
+    static gen_helper_gvec_4_ptr * const fns[2] = {
+        gen_helper_fmmla_s, gen_helper_fmmla_d
+    };
+
+    if (a->esz < MO_32) {
+        return false;
+    }
+    if (a->esz == MO_32 && !dc_isar_feature(aa64_sve2_f32mm, s)) {
+        return false;
+    }
+    if (a->esz == MO_64 && !dc_isar_feature(aa64_sve2_f64mm, s)) {
+        return false;
+    }
+
+    if (sve_access_check(s)) {
+        unsigned vsz = vec_full_reg_size(s);
+        /* esz is MO_32 or MO_64 here, so (esz == MO_16) is always false. */
+        TCGv_ptr status = get_fpstatus_ptr(false);
+
+        tcg_gen_gvec_4_ptr(vec_full_reg_offset(s, a->rd),
+                           vec_full_reg_offset(s, a->ra),
+                           vec_full_reg_offset(s, a->rn),
+                           vec_full_reg_offset(s, a->rm),
+                           status, vsz, vsz, 0, fns[a->esz - MO_32]);
+        tcg_temp_free_ptr(status);
+    }
+    return true;
+}
+
 static bool do_sqdmlal_zzzw(DisasContext *s, arg_rrrr_esz *a,
                             bool sel1, bool sel2)
 {
-- 
2.17.1