Richard Henderson <richard.hender...@linaro.org> writes:
> We still need two different helpers, since NEON and SVE2 get the
> inputs from different locations within the source vector. However,
> we can convert both to the same internal form for computation.
>
> The sve2 helper is not used yet, but adding it with this patch
> helps illustrate why the neon changes are helpful.
>
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>

Reviewed-by: Alex Bennée <alex.ben...@linaro.org>
Tested-by: Alex Bennée <alex.ben...@linaro.org>

> ---
>  target/arm/helper-sve.h    |  2 ++
>  target/arm/helper.h        |  3 +-
>  target/arm/neon_helper.c   | 32 --------------------
>  target/arm/translate-a64.c | 27 +++++++++++------
>  target/arm/translate.c     | 26 ++++++++---------
>  target/arm/vec_helper.c    | 60 ++++++++++++++++++++++++++++++++++++++
>  6 files changed, 95 insertions(+), 55 deletions(-)
>
> diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h
> index 9e79182ab4..2f47279155 100644
> --- a/target/arm/helper-sve.h
> +++ b/target/arm/helper-sve.h
> @@ -1574,3 +1574,5 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG,
>                     void, env, ptr, ptr, ptr, tl, i32)
>  DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG,
>                     void, env, ptr, ptr, ptr, tl, i32)
> +
> +DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> diff --git a/target/arm/helper.h b/target/arm/helper.h
> index d954399b7e..8a8517cf34 100644
> --- a/target/arm/helper.h
> +++ b/target/arm/helper.h
> @@ -335,7 +335,6 @@ DEF_HELPER_2(neon_sub_u8, i32, i32, i32)
>  DEF_HELPER_2(neon_sub_u16, i32, i32, i32)
>  DEF_HELPER_2(neon_mul_u8, i32, i32, i32)
>  DEF_HELPER_2(neon_mul_u16, i32, i32, i32)
> -DEF_HELPER_2(neon_mull_p8, i64, i32, i32)
>
>  DEF_HELPER_2(neon_tst_u8, i32, i32, i32)
>  DEF_HELPER_2(neon_tst_u16, i32, i32, i32)
> @@ -688,6 +687,8 @@ DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void,
> ptr, ptr, ptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
>  DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void,
ptr, ptr, ptr, i32)
>
> +DEF_HELPER_FLAGS_4(neon_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
> +
>  #ifdef TARGET_AARCH64
>  #include "helper-a64.h"
>  #include "helper-sve.h"
> diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
> index 6a107da0e1..c7a8438b42 100644
> --- a/target/arm/neon_helper.c
> +++ b/target/arm/neon_helper.c
> @@ -1129,38 +1129,6 @@ NEON_VOP(mul_u8, neon_u8, 4)
>  NEON_VOP(mul_u16, neon_u16, 2)
>  #undef NEON_FN
>
> -/* Polynomial multiplication is like integer multiplication except the
> -   partial products are XORed, not added. */
> -uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
> -{
> -    uint64_t result = 0;
> -    uint64_t mask;
> -    uint64_t op2ex = op2;
> -    op2ex = (op2ex & 0xff) |
> -        ((op2ex & 0xff00) << 8) |
> -        ((op2ex & 0xff0000) << 16) |
> -        ((op2ex & 0xff000000) << 24);
> -    while (op1) {
> -        mask = 0;
> -        if (op1 & 1) {
> -            mask |= 0xffff;
> -        }
> -        if (op1 & (1 << 8)) {
> -            mask |= (0xffffU << 16);
> -        }
> -        if (op1 & (1 << 16)) {
> -            mask |= (0xffffULL << 32);
> -        }
> -        if (op1 & (1 << 24)) {
> -            mask |= (0xffffULL << 48);
> -        }
> -        result ^= op2ex & mask;
> -        op1 = (op1 >> 1) & 0x7f7f7f7f;
> -        op2ex <<= 1;
> -    }
> -    return result;
> -}
> -
>  #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ?
-1 : 0
>  NEON_VOP(tst_u8, neon_u8, 4)
>  NEON_VOP(tst_u16, neon_u16, 2)
> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
> index 12588d18df..2934e4fc16 100644
> --- a/target/arm/translate-a64.c
> +++ b/target/arm/translate-a64.c
> @@ -10483,10 +10483,6 @@ static void handle_3rd_widening(DisasContext *s, int
> is_q, int is_u, int size,
>                  gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env,
>                                                    tcg_passres, tcg_passres);
>                  break;
> -            case 14: /* PMULL */
> -                assert(size == 0);
> -                gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2);
> -                break;
>              default:
>                  g_assert_not_reached();
>              }
> @@ -10650,11 +10646,21 @@ static void disas_simd_three_reg_diff(DisasContext
> *s, uint32_t insn)
>          handle_3rd_narrowing(s, is_q, is_u, size, opcode, rd, rn, rm);
>          break;
>      case 14: /* PMULL, PMULL2 */
> -        if (is_u || size == 1 || size == 2) {
> +        if (is_u) {
>              unallocated_encoding(s);
>              return;
>          }
> -        if (size == 3) {
> +        switch (size) {
> +        case 0: /* PMULL.P8 */
> +            if (!fp_access_check(s)) {
> +                return;
> +            }
> +            /* The Q field specifies lo/hi half input for this insn. */
> +            gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
> +                             gen_helper_neon_pmull_h);
> +            break;
> +
> +        case 3: /* PMULL.P64 */
>              if (!dc_isar_feature(aa64_pmull, s)) {
>                  unallocated_encoding(s);
>                  return;
> @@ -10665,9 +10671,13 @@ static void disas_simd_three_reg_diff(DisasContext
> *s, uint32_t insn)
>              /* The Q field specifies lo/hi half input for this insn.
*/
>              gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
>                               gen_helper_gvec_pmull_q);
> -            return;
> +            break;
> +
> +        default:
> +            unallocated_encoding(s);
> +            break;
>          }
> -        goto is_widening;
> +        return;
>      case 9: /* SQDMLAL, SQDMLAL2 */
>      case 11: /* SQDMLSL, SQDMLSL2 */
>      case 13: /* SQDMULL, SQDMULL2 */
> @@ -10688,7 +10698,6 @@ static void disas_simd_three_reg_diff(DisasContext
> *s, uint32_t insn)
>              unallocated_encoding(s);
>              return;
>          }
> -    is_widening:
>          if (!fp_access_check(s)) {
>              return;
>          }
> diff --git a/target/arm/translate.c b/target/arm/translate.c
> index 4e34249672..c3abf130cc 100644
> --- a/target/arm/translate.c
> +++ b/target/arm/translate.c
> @@ -5873,15 +5873,20 @@ static int disas_neon_data_insn(DisasContext *s,
> uint32_t insn)
>                  return 1;
>              }
>
> -            /* Handle VMULL.P64 (Polynomial 64x64 to 128 bit multiply)
> -             * outside the loop below as it only performs a single pass.
> -             */
> -            if (op == 14 && size == 2) {
> -                if (!dc_isar_feature(aa32_pmull, s)) {
> -                    return 1;
> +            /* Handle polynomial VMULL in a single pass.
*/
> +            if (op == 14) {
> +                if (size == 0) {
> +                    /* VMULL.P8 */
> +                    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
> +                                       0, gen_helper_neon_pmull_h);
> +                } else {
> +                    /* VMULL.P64 */
> +                    if (!dc_isar_feature(aa32_pmull, s)) {
> +                        return 1;
> +                    }
> +                    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
> +                                       0, gen_helper_gvec_pmull_q);
>                  }
> -                tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
> -                                   0, gen_helper_gvec_pmull_q);
>                  return 0;
>              }
>
> @@ -5959,11 +5964,6 @@ static int disas_neon_data_insn(DisasContext *s,
> uint32_t insn)
>                      /* VMLAL, VQDMLAL, VMLSL, VQDMLSL, VMULL, VQDMULL */
>                      gen_neon_mull(cpu_V0, tmp, tmp2, size, u);
>                      break;
> -                case 14: /* Polynomial VMULL */
> -                    gen_helper_neon_mull_p8(cpu_V0, tmp, tmp2);
> -                    tcg_temp_free_i32(tmp2);
> -                    tcg_temp_free_i32(tmp);
> -                    break;
>                  default: /* 15 is RESERVED: caught earlier */
>                      abort();
>                  }
> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
> index 5c1074374e..04b4d7402d 100644
> --- a/target/arm/vec_helper.c
> +++ b/target/arm/vec_helper.c
> @@ -1197,3 +1197,63 @@ void HELPER(gvec_pmull_q)(void *vd, void *vn, void
> *vm, uint32_t desc)
>      }
>      clear_tail(d, opr_sz, simd_maxsz(desc));
>  }
> +
> +/*
> + * 8x8->16 polynomial multiply.
> + *
> + * The byte inputs are expanded to (or extracted from) half-words.
> + * Note that neon and sve2 get the inputs from different positions.
> + * This allows 4 bytes to be processed in parallel with uint64_t.
> + */
> +
> +static uint64_t expand_byte_to_half(uint64_t x)
> +{
> +    return (x & 0x000000ff)
> +         | ((x & 0x0000ff00) << 8)
> +         | ((x & 0x00ff0000) << 16)
> +         | ((x & 0xff000000) << 24);
> +}
> +
> +static uint64_t pmull_h(uint64_t op1, uint64_t op2)
> +{
> +    uint64_t result = 0;
> +    int i;
> +
> +    for (i = 0; i < 8; ++i) {
> +        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
> +        result ^= op2 & mask;
> +        op1 >>= 1;
> +        op2 <<= 1;
> +    }
> +    return result;
> +}
> +
> +void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
> +{
> +    int hi = simd_data(desc);
> +    uint64_t *d = vd, *n = vn, *m = vm;
> +    uint64_t nn = n[hi], mm = m[hi];
> +
> +    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
> +    nn >>= 32;
> +    mm >>= 32;
> +    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
> +
> +    clear_tail(d, 16, simd_maxsz(desc));
> +}
> +
> +#ifdef TARGET_AARCH64
> +void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
> +{
> +    int shift = simd_data(desc) * 8;
> +    intptr_t i, opr_sz = simd_oprsz(desc);
> +    uint64_t *d = vd, *n = vn, *m = vm;
> +
> +    for (i = 0; i < opr_sz / 8; ++i) {
> +        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
> +        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
> +
> +        d[i] = pmull_h(nn, mm);
> +    }
> +}
> +#endif

--
Alex Bennée