On 4/2/19 7:07 PM, Aleksandar Markovic wrote: >> From: Philippe Mathieu-Daudé <phi...@redhat.com> >> Subject: Re: [Qemu-devel] [PATCH v5 2/2] target/mips: Optimize >> ILVEV.<B|H|W|D> MSA instructions >> >> Hi Mateja, >> >> On 4/2/19 5:15 PM, Mateja Marjanovic wrote: >>> From: Mateja Marjanovic <mateja.marjano...@rt-rk.com> >>> >>> Optimize set of MSA instructions ILVEV, using directly >>> tcg registers and performing logic on them instead of >>> using helpers. >> >> Maybe you can still let this previous comment (if still valid): >> >> Performance measurement is done by executing the >> instructions large number of times on a computer >> with Intel Core i7-3770 CPU @ 3.40GHz×8. >> > > Agreed. > >>> >>> In the following table, the first column is the performance >>> before this patch. The second represents the performance, >>> after converting from helpers to tcg, but without using >>> tcg_gen_deposit function. The third one is the solution >>> which is implemented in this patch. >> >> You are describing the "no-deposit" which refers to a previous series >> but won't be accessible in the git repository. >> >> I think this table is useful in the cover of this series, and in the >> commit message you should use || before || after || and drop the >> "no-deposit" case. >> >>> >>> instr || before || no-deposit || with-deposit >>> ======================================================== >>> ilvev.b || 126.92 ms || 24.52 ms || 24.43 ms >>> ilvev.h || 93.67 ms || 23.92 ms || 23.86 ms >>> ilvev.w || 117.86 ms || 23.83 ms || 22.17 ms >>> ilvev.d || 45.49 ms || 19.74 ms || 19.71 ms >>> >>> The solution with deposit is suggested by Richard Henderson. >>> >> > > I think the table should remain in the commit message, to keep it > visible in the git logs. > > You could insert the "no-deposit" source code of gen_ilvev_w() > in the commit message, for reference reasons - it is not a too > large function.
Clever :) > > Thanks, > Aleksandar > >> The gitdm parsable form is: >> >> Suggested-by: Richard Henderson <richard.hender...@linaro.org> >> >>> Signed-off-by: Mateja Marjanovic <mateja.marjano...@rt-rk.com> >>> --- >>> target/mips/helper.h | 1 - >>> target/mips/msa_helper.c | 9 ---- >>> target/mips/translate.c | 105 >>> ++++++++++++++++++++++++++++++++++++++++++++++- >>> 3 files changed, 104 insertions(+), 11 deletions(-) >>> >>> diff --git a/target/mips/helper.h b/target/mips/helper.h >>> index 02e16c7..82f6a40 100644 >>> --- a/target/mips/helper.h >>> +++ b/target/mips/helper.h >>> @@ -864,7 +864,6 @@ DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, >>> i32) >>> DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32) >>> DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32) >>> DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32) >>> -DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32) >>> DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32) >>> DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32) >>> DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32) >>> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c >>> index a7ea6aa..d5c3842 100644 >>> --- a/target/mips/msa_helper.c >>> +++ b/target/mips/msa_helper.c >>> @@ -1197,15 +1197,6 @@ MSA_FN_DF(ilvl_df) >>> } while (0) >>> MSA_FN_DF(ilvr_df) >>> #undef MSA_DO >>> - >>> -#define MSA_DO(DF) \ >>> - do { \ >>> - pwx->DF[2*i] = pwt->DF[2*i]; \ >>> - pwx->DF[2*i+1] = pws->DF[2*i]; \ >>> - } while (0) >>> -MSA_FN_DF(ilvev_df) >>> -#undef MSA_DO >>> - >>> #undef MSA_LOOP_COND >>> >>> #define MSA_LOOP_COND(DF) \ >>> diff --git a/target/mips/translate.c b/target/mips/translate.c >>> index 04406d6..e26c6a6 100644 >>> --- a/target/mips/translate.c >>> +++ b/target/mips/translate.c >>> @@ -28974,6 +28974,94 @@ static inline void gen_ilvod_d(CPUMIPSState *env, >>> uint32_t wd, >>> tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]); >>> } >>> >>> +/* >>> + * [MSA] ILVEV.B wd, ws, wt >>> + * >>> + * Vector Interleave Even (byte data elements) >>> + * >>> + */ >>> +static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd, >>> + uint32_t ws, uint32_t wt) >>> +{ >>> + TCGv_i64 t1 = tcg_temp_new_i64(); >>> + TCGv_i64 t2 = tcg_temp_new_i64(); >>> + const uint64_t mask = 0x00ff00ff00ff00ffULL; >>> + >>> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); >>> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2], mask); >>> + tcg_gen_shli_i64(t2, t2, 8); >>> + tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2); >>> + >> >> Richard, is it cheaper to use another register to keep the constant mask >> (here reused 4x)? >> >> Such: >> >> TCGv_i64 mask = tcg_const_i64(0x00ff00ff00ff00ffULL); >> >> tcg_gen_and_i64(t1, msa_wr_d[wt * 2], mask); >> tcg_gen_and_i64(t2, msa_wr_d[ws * 2], mask); >> tcg_gen_shli_i64(t2, t2, 8); >> tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2); >> >>> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); >>> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2 + 1], mask); >> >> Here use tcg_gen_and_i64() too. >> >>> + tcg_gen_shli_i64(t2, t2, 8); >>> + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t2); >>> + >> >> tcg_temp_free_i64(mask); >> >>> + tcg_temp_free_i64(t1); >>> + tcg_temp_free_i64(t2); >> >> Mateja: Can you test for perf easily? >> >>> +} >>> + >>> +/* >>> + * [MSA] ILVEV.H wd, ws, wt >>> + * >>> + * Vector Interleave Even (halfword data elements) >>> + * >>> + */ >>> +static inline void gen_ilvev_h(CPUMIPSState *env, uint32_t wd, >>> + uint32_t ws, uint32_t wt) >>> +{ >>> + TCGv_i64 t1 = tcg_temp_new_i64(); >>> + TCGv_i64 t2 = tcg_temp_new_i64(); >>> + const uint64_t mask = 0x0000ffff0000ffffULL; >>> + >>> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); >>> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2], mask); >>> + tcg_gen_shli_i64(t2, t2, 16); >>> + tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2); >>> + >>> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); >>> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2 + 1], mask); >>> + tcg_gen_shli_i64(t2, t2, 16); >>> + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t2); >>> + >>> + tcg_temp_free_i64(t1); >>> + tcg_temp_free_i64(t2); >> >> Very similiar to gen_ilvev_b(), changing the mask and shift. Maybe worth >> a refactor. >> >>> +} >>> + >>> +/* >>> + * [MSA] ILVEV.W wd, ws, wt >>> + * >>> + * Vector Interleave Even (word data elements) >>> + * >>> + */ >>> +static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd, >>> + uint32_t ws, uint32_t wt) >>> +{ >>> + TCGv_i64 t1 = tcg_temp_new_i64(); >>> + const uint64_t mask = 0x00000000ffffffffULL; >>> + >>> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); >>> + tcg_gen_deposit_i64(msa_wr_d[wd * 2], t1, msa_wr_d[ws * 2], 32, 32); >>> + >>> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); >>> + tcg_gen_deposit_i64(msa_wr_d[wd * 2 + 1], t1, msa_wr_d[ws * 2 + 1], >>> 32, 32); >>> + >>> + tcg_temp_free_i64(t1); >>> +} >>> + >>> +/* >>> + * [MSA] ILVEV.D wd, ws, wt >>> + * >>> + * Vector Interleave Even (Double data elements) >>> + * >>> + */ >>> +static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd, >>> + uint32_t ws, uint32_t wt) >>> +{ >>> + tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]); >>> + tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]); >>> +} >>> + >>> static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx) >>> { >>> #define MASK_MSA_3R(op) (MASK_MSA_MINOR(op) | (op & (0x7 << 23))) >>> @@ -29130,7 +29218,22 @@ static void gen_msa_3r(CPUMIPSState *env, >>> DisasContext *ctx) >>> gen_helper_msa_mod_s_df(cpu_env, tdf, twd, tws, twt); >>> break; >>> case OPC_ILVEV_df: >>> - gen_helper_msa_ilvev_df(cpu_env, tdf, twd, tws, twt); >>> + switch (df) { >>> + case DF_BYTE: >>> + gen_ilvev_b(env, wd, ws, wt); >>> + break; >>> + case DF_HALF: >>> + gen_ilvev_h(env, wd, ws, wt); >>> + break; >>> + case DF_WORD: >>> + gen_ilvev_w(env, wd, ws, wt); >>> + break; >>> + case DF_DOUBLE: >>> + gen_ilvev_d(env, wd, ws, wt); >>> + break; >>> + default: >>> + assert(0); >>> + } >>> break; >>> case OPC_BINSR_df: >>> gen_helper_msa_binsr_df(cpu_env, tdf, twd, tws, twt); >>> >> >> Thanks, >> >> Phil. >> > > ________________________________________ > From: Philippe Mathieu-Daudé <phi...@redhat.com> > Sent: Tuesday, April 2, 2019 6:19:19 PM > To: Mateja Marjanovic; qemu-devel@nongnu.org > Cc: Aleksandar Rikalo; richard.hender...@linaro.org; Aleksandar Markovic; > aurel...@aurel32.net > Subject: Re: [Qemu-devel] [PATCH v5 2/2] target/mips: Optimize > ILVEV.<B|H|W|D> MSA instructions > > Hi Mateja, > > On 4/2/19 5:15 PM, Mateja Marjanovic wrote: >> From: Mateja Marjanovic <mateja.marjano...@rt-rk.com> >> >> Optimize set of MSA instructions ILVEV, using directly >> tcg registers and performing logic on them instead of >> using helpers. > > Maybe you can still let this previous comment (if still valid): > > Performance measurement is done by executing the > instructions large number of times on a computer > with Intel Core i7-3770 CPU @ 3.40GHz×8. > >> >> In the following table, the first column is the performance >> before this patch. The second represents the performance, >> after converting from helpers to tcg, but without using >> tcg_gen_deposit function. The third one is the solution >> which is implemented in this patch. > > You are describing the "no-deposit" which refers to a previous series > but won't be accessible in the git repository. > > I think this table is useful in the cover of this series, and in the > commit message you should use || before || after || and drop the > "no-deposit" case. > >> >> instr || before || no-deposit || with-deposit >> ======================================================== >> ilvev.b || 126.92 ms || 24.52 ms || 24.43 ms >> ilvev.h || 93.67 ms || 23.92 ms || 23.86 ms >> ilvev.w || 117.86 ms || 23.83 ms || 22.17 ms >> ilvev.d || 45.49 ms || 19.74 ms || 19.71 ms >> >> The solution with deposit is suggested by Richard Henderson. >> > > The gitdm parsable form is: > > Suggested-by: Richard Henderson <richard.hender...@linaro.org> > >> Signed-off-by: Mateja Marjanovic <mateja.marjano...@rt-rk.com> >> --- >> target/mips/helper.h | 1 - >> target/mips/msa_helper.c | 9 ---- >> target/mips/translate.c | 105 >> ++++++++++++++++++++++++++++++++++++++++++++++- >> 3 files changed, 104 insertions(+), 11 deletions(-) >> >> diff --git a/target/mips/helper.h b/target/mips/helper.h >> index 02e16c7..82f6a40 100644 >> --- a/target/mips/helper.h >> +++ b/target/mips/helper.h >> @@ -864,7 +864,6 @@ DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32) >> DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32) >> DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32) >> DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32) >> -DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32) >> DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32) >> DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32) >> DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32) >> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c >> index a7ea6aa..d5c3842 100644 >> --- a/target/mips/msa_helper.c >> +++ b/target/mips/msa_helper.c >> @@ -1197,15 +1197,6 @@ MSA_FN_DF(ilvl_df) >> } while (0) >> MSA_FN_DF(ilvr_df) >> #undef MSA_DO >> - >> -#define MSA_DO(DF) \ >> - do { \ >> - pwx->DF[2*i] = pwt->DF[2*i]; \ >> - pwx->DF[2*i+1] = pws->DF[2*i]; \ >> - } while (0) >> -MSA_FN_DF(ilvev_df) >> -#undef MSA_DO >> - >> #undef MSA_LOOP_COND >> >> #define MSA_LOOP_COND(DF) \ >> diff --git a/target/mips/translate.c b/target/mips/translate.c >> index 04406d6..e26c6a6 100644 >> --- a/target/mips/translate.c >> +++ b/target/mips/translate.c >> @@ -28974,6 +28974,94 @@ static inline void gen_ilvod_d(CPUMIPSState *env, >> uint32_t wd, >> tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]); >> } >> >> +/* >> + * [MSA] ILVEV.B wd, ws, wt >> + * >> + * Vector Interleave Even (byte data elements) >> + * >> + */ >> +static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd, >> + uint32_t ws, uint32_t wt) >> +{ >> + TCGv_i64 t1 = tcg_temp_new_i64(); >> + TCGv_i64 t2 = tcg_temp_new_i64(); >> + const uint64_t mask = 0x00ff00ff00ff00ffULL; >> + >> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); >> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2], mask); >> + tcg_gen_shli_i64(t2, t2, 8); >> + tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2); >> + > > Richard, is it cheaper to use another register to keep the constant mask > (here reused 4x)? > > Such: > > TCGv_i64 mask = tcg_const_i64(0x00ff00ff00ff00ffULL); > > tcg_gen_and_i64(t1, msa_wr_d[wt * 2], mask); > tcg_gen_and_i64(t2, msa_wr_d[ws * 2], mask); > tcg_gen_shli_i64(t2, t2, 8); > tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2); > >> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); >> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2 + 1], mask); > > Here use tcg_gen_and_i64() too. > >> + tcg_gen_shli_i64(t2, t2, 8); >> + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t2); >> + > > tcg_temp_free_i64(mask); > >> + tcg_temp_free_i64(t1); >> + tcg_temp_free_i64(t2); > > Mateja: Can you test for perf easily? > >> +} >> + >> +/* >> + * [MSA] ILVEV.H wd, ws, wt >> + * >> + * Vector Interleave Even (halfword data elements) >> + * >> + */ >> +static inline void gen_ilvev_h(CPUMIPSState *env, uint32_t wd, >> + uint32_t ws, uint32_t wt) >> +{ >> + TCGv_i64 t1 = tcg_temp_new_i64(); >> + TCGv_i64 t2 = tcg_temp_new_i64(); >> + const uint64_t mask = 0x0000ffff0000ffffULL; >> + >> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); >> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2], mask); >> + tcg_gen_shli_i64(t2, t2, 16); >> + tcg_gen_or_i64(msa_wr_d[wd * 2], t1, t2); >> + >> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); >> + tcg_gen_andi_i64(t2, msa_wr_d[ws * 2 + 1], mask); >> + tcg_gen_shli_i64(t2, t2, 16); >> + tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t1, t2); >> + >> + tcg_temp_free_i64(t1); >> + tcg_temp_free_i64(t2); > > Very similiar to gen_ilvev_b(), changing the mask and shift. Maybe worth > a refactor. > >> +} >> + >> +/* >> + * [MSA] ILVEV.W wd, ws, wt >> + * >> + * Vector Interleave Even (word data elements) >> + * >> + */ >> +static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd, >> + uint32_t ws, uint32_t wt) >> +{ >> + TCGv_i64 t1 = tcg_temp_new_i64(); >> + const uint64_t mask = 0x00000000ffffffffULL; >> + >> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask); >> + tcg_gen_deposit_i64(msa_wr_d[wd * 2], t1, msa_wr_d[ws * 2], 32, 32); >> + >> + tcg_gen_andi_i64(t1, msa_wr_d[wt * 2 + 1], mask); >> + tcg_gen_deposit_i64(msa_wr_d[wd * 2 + 1], t1, msa_wr_d[ws * 2 + 1], 32, >> 32); >> + >> + tcg_temp_free_i64(t1); >> +} >> + >> +/* >> + * [MSA] ILVEV.D wd, ws, wt >> + * >> + * Vector Interleave Even (Double data elements) >> + * >> + */ >> +static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd, >> + uint32_t ws, uint32_t wt) >> +{ >> + tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]); >> + tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]); >> +} >> + >> static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx) >> { >> #define MASK_MSA_3R(op) (MASK_MSA_MINOR(op) | (op & (0x7 << 23))) >> @@ -29130,7 +29218,22 @@ static void gen_msa_3r(CPUMIPSState *env, >> DisasContext *ctx) >> gen_helper_msa_mod_s_df(cpu_env, tdf, twd, tws, twt); >> break; >> case OPC_ILVEV_df: >> - gen_helper_msa_ilvev_df(cpu_env, tdf, twd, tws, twt); >> + switch (df) { >> + case DF_BYTE: >> + gen_ilvev_b(env, wd, ws, wt); >> + break; >> + case DF_HALF: >> + gen_ilvev_h(env, wd, ws, wt); >> + break; >> + case DF_WORD: >> + gen_ilvev_w(env, wd, ws, wt); >> + break; >> + case DF_DOUBLE: >> + gen_ilvev_d(env, wd, ws, wt); >> + break; >> + default: >> + assert(0); >> + } >> break; >> case OPC_BINSR_df: >> gen_helper_msa_binsr_df(cpu_env, tdf, twd, tws, twt); >> > > Thanks, > > Phil. >