Those accesses are always naturally aligned, so they cannot cross a page boundary. Thus, instead of generating two 8-byte loads with a translation for each (and a double byteswap for LE on LE), we use a helper that does a single translation and memcpy's the result across (or does the appropriate swapping when guest and host endianness differ).
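To illustrate the idea, the fast path condenses to the standalone C sketch below. Here avr_t, load_vector() and the same_endian flag are hypothetical stand-ins for ppc_avr_t, helper_lvx() and the msr_le/HI_IDX test in the patch, and bswap64() is a portable stand-in (assuming GCC/Clang) for QEMU's:

    #include <stdint.h>
    #include <string.h>

    /* Portable stand-in for QEMU's bswap64(), assuming GCC/Clang. */
    static inline uint64_t bswap64(uint64_t v)
    {
        return __builtin_bswap64(v);
    }

    /* Hypothetical 16-byte vector register image (cf. ppc_avr_t). */
    typedef union {
        uint64_t u64[2];
        uint8_t  u8[16];
    } avr_t;

    /* Once the whole quadword has been translated to a host pointer,
     * a matching byte order needs only one memcpy; a mismatch needs
     * each 8-byte half swapped and the two halves exchanged. */
    static void load_vector(avr_t *reg, const avr_t *haddr, int same_endian)
    {
        if (same_endian) {
            memcpy(reg, haddr, 16);
        } else {
            reg->u64[0] = bswap64(haddr->u64[1]);
            reg->u64[1] = bswap64(haddr->u64[0]);
        }
    }

The single translation also means a single TLB lookup per lvx/stvx instead of one per doubleword; only when the page is not host-accessible do we fall back to per-doubleword accesses.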
Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
---
 target-ppc/helper.h             |  2 ++
 target-ppc/mem_helper.c         | 60 +++++++++++++++++++++++++++++++++++++++++
 target-ppc/translate/vmx-impl.c | 38 ++++++++------------------
 target-ppc/translate/vmx-ops.c  |  4 +--
 4 files changed, 75 insertions(+), 29 deletions(-)

diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index 9e4bb7b..3b6e6e6 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -272,9 +272,11 @@ DEF_HELPER_5(vmsumshm, void, env, avr, avr, avr, avr)
 DEF_HELPER_5(vmsumshs, void, env, avr, avr, avr, avr)
 DEF_HELPER_4(vmladduhm, void, avr, avr, avr, avr)
 DEF_HELPER_2(mtvscr, void, env, avr)
+DEF_HELPER_3(lvx, void, env, i32, tl)
 DEF_HELPER_3(lvebx, void, env, avr, tl)
 DEF_HELPER_3(lvehx, void, env, avr, tl)
 DEF_HELPER_3(lvewx, void, env, avr, tl)
+DEF_HELPER_3(stvx, void, env, i32, tl)
 DEF_HELPER_3(stvebx, void, env, avr, tl)
 DEF_HELPER_3(stvehx, void, env, avr, tl)
 DEF_HELPER_3(stvewx, void, env, avr, tl)
diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
index 6548715..da3f973 100644
--- a/target-ppc/mem_helper.c
+++ b/target-ppc/mem_helper.c
@@ -225,6 +225,66 @@ target_ulong helper_lscbx(CPUPPCState *env, target_ulong addr, uint32_t reg,
 #define LO_IDX 0
 #endif
 
+void helper_lvx(CPUPPCState *env, uint32_t vr, target_ulong addr)
+{
+    uintptr_t raddr = GETPC();
+    ppc_avr_t *haddr;
+
+    /* Align address */
+    addr &= ~(target_ulong)0xf;
+
+    /* Try fast path translate */
+    haddr = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, env->dmmu_idx);
+    if (haddr) {
+        if (msr_le && HI_IDX) {
+            memcpy(&env->avr[vr], haddr, 16);
+        } else {
+            env->avr[vr].u64[LO_IDX] = bswap64(haddr->u64[HI_IDX]);
+            env->avr[vr].u64[HI_IDX] = bswap64(haddr->u64[LO_IDX]);
+        }
+    } else {
+        if (needs_byteswap(env)) {
+            env->avr[vr].u64[LO_IDX] =
+                bswap64(cpu_ldq_data_ra(env, addr, raddr));
+            env->avr[vr].u64[HI_IDX] =
+                bswap64(cpu_ldq_data_ra(env, addr + 8, raddr));
+        } else {
+            env->avr[vr].u64[HI_IDX] = cpu_ldq_data_ra(env, addr, raddr);
+            env->avr[vr].u64[LO_IDX] = cpu_ldq_data_ra(env, addr + 8, raddr);
+        }
+    }
+}
+
+void helper_stvx(CPUPPCState *env, uint32_t vr, target_ulong addr)
+{
+    uintptr_t raddr = GETPC();
+    ppc_avr_t *haddr;
+
+    /* Align address */
+    addr &= ~(target_ulong)0xf;
+
+    /* Try fast path translate */
+    haddr = tlb_vaddr_to_host(env, addr, MMU_DATA_STORE, env->dmmu_idx);
+    if (haddr) {
+        if (msr_le && HI_IDX) {
+            memcpy(haddr, &env->avr[vr], 16);
+        } else {
+            haddr->u64[LO_IDX] = bswap64(env->avr[vr].u64[HI_IDX]);
+            haddr->u64[HI_IDX] = bswap64(env->avr[vr].u64[LO_IDX]);
+        }
+    } else {
+        if (needs_byteswap(env)) {
+            cpu_stq_data_ra(env, addr,
+                            bswap64(env->avr[vr].u64[LO_IDX]), raddr);
+            cpu_stq_data_ra(env, addr + 8,
+                            bswap64(env->avr[vr].u64[HI_IDX]), raddr);
+        } else {
+            cpu_stq_data_ra(env, addr, env->avr[vr].u64[HI_IDX], raddr);
+            cpu_stq_data_ra(env, addr + 8, env->avr[vr].u64[LO_IDX], raddr);
+        }
+    }
+}
+
 /* We use msr_le to determine index ordering in a vector.  However,
    byteswapping is not simply controlled by msr_le.  We also need to take
    into account endianness of the target.  This is done for the little-endian
diff --git a/target-ppc/translate/vmx-impl.c b/target-ppc/translate/vmx-impl.c
index 110e19c..a58aa0c 100644
--- a/target-ppc/translate/vmx-impl.c
+++ b/target-ppc/translate/vmx-impl.c
@@ -15,55 +15,39 @@ static inline TCGv_ptr gen_avr_ptr(int reg)
 }
 
 #define GEN_VR_LDX(name, opc2, opc3)                                          \
-static void glue(gen_, name)(DisasContext *ctx)                                       \
+static void glue(gen_, name)(DisasContext *ctx)                               \
 {                                                                             \
     TCGv EA;                                                                  \
+    TCGv_i32 t0;                                                              \
     if (unlikely(!ctx->altivec_enabled)) {                                    \
         gen_exception(ctx, POWERPC_EXCP_VPU);                                 \
         return;                                                               \
     }                                                                         \
     gen_set_access_type(ctx, ACCESS_INT);                                     \
     EA = tcg_temp_new();                                                      \
+    t0 = tcg_const_i32(rD(ctx->opcode));                                      \
     gen_addr_reg_index(ctx, EA);                                              \
-    tcg_gen_andi_tl(EA, EA, ~0xf);                                            \
-    /* We only need to swap high and low halves. gen_qemu_ld64 does necessary \
-       64-bit byteswap already. */                                            \
-    if (ctx->le_mode) {                                                       \
-        gen_qemu_ld64(ctx, cpu_avrl[rD(ctx->opcode)], EA);                    \
-        tcg_gen_addi_tl(EA, EA, 8);                                           \
-        gen_qemu_ld64(ctx, cpu_avrh[rD(ctx->opcode)], EA);                    \
-    } else {                                                                  \
-        gen_qemu_ld64(ctx, cpu_avrh[rD(ctx->opcode)], EA);                    \
-        tcg_gen_addi_tl(EA, EA, 8);                                           \
-        gen_qemu_ld64(ctx, cpu_avrl[rD(ctx->opcode)], EA);                    \
-    }                                                                         \
+    gen_helper_lvx(cpu_env, t0, EA);                                          \
     tcg_temp_free(EA);                                                        \
+    tcg_temp_free_i32(t0);                                                    \
 }
 
 #define GEN_VR_STX(name, opc2, opc3)                                          \
 static void gen_st##name(DisasContext *ctx)                                   \
 {                                                                             \
     TCGv EA;                                                                  \
+    TCGv_i32 t0;                                                              \
     if (unlikely(!ctx->altivec_enabled)) {                                    \
         gen_exception(ctx, POWERPC_EXCP_VPU);                                 \
         return;                                                               \
     }                                                                         \
     gen_set_access_type(ctx, ACCESS_INT);                                     \
     EA = tcg_temp_new();                                                      \
+    t0 = tcg_const_i32(rD(ctx->opcode));                                      \
     gen_addr_reg_index(ctx, EA);                                              \
-    tcg_gen_andi_tl(EA, EA, ~0xf);                                            \
-    /* We only need to swap high and low halves. gen_qemu_st64 does necessary \
-       64-bit byteswap already. */                                            \
-    if (ctx->le_mode) {                                                       \
-        gen_qemu_st64(ctx, cpu_avrl[rD(ctx->opcode)], EA);                    \
-        tcg_gen_addi_tl(EA, EA, 8);                                           \
-        gen_qemu_st64(ctx, cpu_avrh[rD(ctx->opcode)], EA);                    \
-    } else {                                                                  \
-        gen_qemu_st64(ctx, cpu_avrh[rD(ctx->opcode)], EA);                    \
-        tcg_gen_addi_tl(EA, EA, 8);                                           \
-        gen_qemu_st64(ctx, cpu_avrl[rD(ctx->opcode)], EA);                    \
-    }                                                                         \
+    gen_helper_stvx(cpu_env, t0, EA);                                         \
     tcg_temp_free(EA);                                                        \
+    tcg_temp_free_i32(t0);                                                    \
 }
 
 #define GEN_VR_LVE(name, opc2, opc3, size)                                    \
@@ -116,9 +100,9 @@ GEN_VR_LVE(bx, 0x07, 0x00, 1);
 GEN_VR_LVE(hx, 0x07, 0x01, 2);
 GEN_VR_LVE(wx, 0x07, 0x02, 4);
 
-GEN_VR_STX(svx, 0x07, 0x07);
+GEN_VR_STX(vx, 0x07, 0x07);
 /* As we don't emulate the cache, stvxl is stricly equivalent to stvx */
-GEN_VR_STX(svxl, 0x07, 0x0F);
+GEN_VR_STX(vxl, 0x07, 0x0F);
 
 GEN_VR_STVE(bx, 0x07, 0x04, 1);
 GEN_VR_STVE(hx, 0x07, 0x05, 2);
diff --git a/target-ppc/translate/vmx-ops.c b/target-ppc/translate/vmx-ops.c
index b9c982a..6c7d150 100644
--- a/target-ppc/translate/vmx-ops.c
+++ b/target-ppc/translate/vmx-ops.c
@@ -11,8 +11,8 @@ GEN_VR_LDX(lvxl, 0x07, 0x0B),
 GEN_VR_LVE(bx, 0x07, 0x00),
 GEN_VR_LVE(hx, 0x07, 0x01),
 GEN_VR_LVE(wx, 0x07, 0x02),
-GEN_VR_STX(svx, 0x07, 0x07),
-GEN_VR_STX(svxl, 0x07, 0x0F),
+GEN_VR_STX(vx, 0x07, 0x07),
+GEN_VR_STX(vxl, 0x07, 0x0F),
 GEN_VR_STVE(bx, 0x07, 0x04),
 GEN_VR_STVE(hx, 0x07, 0x05),
 GEN_VR_STVE(wx, 0x07, 0x06),
-- 
2.7.4