On Thursday, October 17, 2019, Stefan Brankovic <stefan.branko...@rt-rk.com> wrote:
> Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword). > This instruction counts the number of leading zeros of each halfword > element > in source register and places result in the appropriate halfword element of > destination register. > > In each iteration of outer for loop count operation is performed on one > doubleword element of source register vB. In the first iteration, higher > doubleword element of vB is placed in variable avr, and then counting > for every halfword element is performed by using tcg_gen_clzi_i64. > Since it counts leading zeros on 64 bit length, ith halfword element has to > be moved to the highest 16 bits of tmp, or-ed with mask(in order to get all > ones in lowest 48 bits), then perform tcg_gen_clzi_i64 and move its result > in appropriate halfword element of result. This is done in inner for loop. > After the operation is finished, the result is saved in the appropriate > doubleword element of destination register vD. The same sequence of orders > is to be applied again for the lower doubleword element of vB. > > Optimize Altivec instruction vclzb (Vector Count Leading Zeros Byte). > This instruction counts the number of leading zeros of each byte element > in source register and places result in the appropriate byte element of > destination register. > > In each iteration of the outer for loop, counting operation is done on one > doubleword element of source register vB. In the first iteration, the > higher doubleword element of vB is placed in variable avr, and then > counting > for every byte element is performed using tcg_gen_clzi_i64. Since it counts > leading zeros on 64 bit length, ith byte element has to be moved to the > highest > 8 bits of variable tmp, ored with mask(in order to get all ones in the > lowest > 56 bits), then perform tcg_gen_clzi_i64 and move its result in the > appropriate > byte element of result. This is done in inner for loop. 
After the > operation is > finished, the result is saved in the appropriate doubleword element of > destination > register vD. The same sequence of orders is to be applied again for the > lower > doubleword element of vB. > > The same hints as for the commit message of patch 2/3. Additionally, the first and the third paragraph should be merged into a single one at the beginning of the commit message. Signed-off-by: Stefan Brankovic <stefan.branko...@rt-rk.com> > --- > target/ppc/helper.h | 2 - > target/ppc/int_helper.c | 9 --- > target/ppc/translate/vmx-impl.inc.c | 136 ++++++++++++++++++++++++++++++ > +++++- > 3 files changed, 134 insertions(+), 13 deletions(-) > > diff --git a/target/ppc/helper.h b/target/ppc/helper.h > index f843814..281e54f 100644 > --- a/target/ppc/helper.h > +++ b/target/ppc/helper.h > @@ -308,8 +308,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32) > DEF_HELPER_4(vctuxs, void, env, avr, avr, i32) > DEF_HELPER_4(vctsxs, void, env, avr, avr, i32) > > -DEF_HELPER_2(vclzb, void, avr, avr) > -DEF_HELPER_2(vclzh, void, avr, avr) > DEF_HELPER_2(vctzb, void, avr, avr) > DEF_HELPER_2(vctzh, void, avr, avr) > DEF_HELPER_2(vctzw, void, avr, avr) > diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c > index 6d238b9..cd00f5e 100644 > --- a/target/ppc/int_helper.c > +++ b/target/ppc/int_helper.c > @@ -1817,15 +1817,6 @@ VUPK(lsw, s64, s32, UPKLO) > } \ > } > > -#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8) > -#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16) > - > -VGENERIC_DO(clzb, u8) > -VGENERIC_DO(clzh, u16) > - > -#undef clzb > -#undef clzh > - > #define ctzb(v) ((v) ? ctz32(v) : 8) > #define ctzh(v) ((v) ? 
ctz32(v) : 16) > #define ctzw(v) ctz32((v)) > diff --git a/target/ppc/translate/vmx-impl.inc.c > b/target/ppc/translate/vmx-impl.inc.c > index 2472a52..a428ef3 100644 > --- a/target/ppc/translate/vmx-impl.inc.c > +++ b/target/ppc/translate/vmx-impl.inc.c > @@ -751,6 +751,138 @@ static void trans_vgbbd(DisasContext *ctx) > } > > /* > + * vclzb VRT,VRB - Vector Count Leading Zeros Byte > + * > + * Counting the number of leading zero bits of each byte element in source > + * register and placing result in appropriate byte element of destination > + * register. > + */ > +static void trans_vclzb(DisasContext *ctx) > +{ > + int VT = rD(ctx->opcode); > + int VB = rB(ctx->opcode); > + TCGv_i64 avr = tcg_temp_new_i64(); > + TCGv_i64 result = tcg_temp_new_i64(); > + TCGv_i64 result1 = tcg_temp_new_i64(); > + TCGv_i64 result2 = tcg_temp_new_i64(); > + TCGv_i64 tmp = tcg_temp_new_i64(); > + TCGv_i64 mask = tcg_const_i64(0xffffffffffffffULL); > + int i, j; > + > + for (i = 0; i < 2; i++) { > + if (i == 0) { > + /* Get high doubleword of vB in 'avr'. */ > + get_avr64(avr, VB, true); > + } else { > + /* Get low doubleword of vB in 'avr'. */ > + get_avr64(avr, VB, false); > + } > + /* > + * Perform count for every byte element using 'tcg_gen_clzi_i64'. > + * Since it counts leading zeros on 64 bit lenght, we have to move > + * ith byte element to highest 8 bits of 'tmp', or it with > mask(so we > + * get all ones in lowest 56 bits), then perform > 'tcg_gen_clzi_i64' and > + * move it's result in appropriate byte element of result. 
> + */ > + tcg_gen_shli_i64(tmp, avr, 56); before this line, insert a blank line and the comment: /* count leading zeroes for bits 0..7 */ > + tcg_gen_or_i64(tmp, tmp, mask); > + tcg_gen_clzi_i64(result, tmp, 64); > + for (j = 1; j < 7; j++) { > + tcg_gen_shli_i64(tmp, avr, (7 - j) * 8); before this line, insert comment: /* count leading zeroes for bits 8*j..8*j+7 */ > + tcg_gen_or_i64(tmp, tmp, mask); > + tcg_gen_clzi_i64(tmp, tmp, 64); > + tcg_gen_deposit_i64(result, result, tmp, j * 8, 8); > + } > + tcg_gen_or_i64(tmp, avr, mask); before this line, insert comment: /* count leading zeroes for bits 56..63 */ > + tcg_gen_clzi_i64(tmp, tmp, 64); > + tcg_gen_deposit_i64(result, result, tmp, 56, 8); + if (i == 0) { > + /* Place result in high doubleword element of vD. */ > + tcg_gen_mov_i64(result1, result); > + } else { > + /* Place result in low doubleword element of vD. */ > + tcg_gen_mov_i64(result2, result); > + } > + } > + > + set_avr64(VT, result1, true); > + set_avr64(VT, result2, false); > + > + tcg_temp_free_i64(avr); > + tcg_temp_free_i64(result); > + tcg_temp_free_i64(result1); > + tcg_temp_free_i64(result2); > + tcg_temp_free_i64(tmp); > + tcg_temp_free_i64(mask); > +} > + > +/* > + * vclzh VRT,VRB - Vector Count Leading Zeros Halfword Similar hints as in the case of the previous function. > + * > + * Counting the number of leading zero bits of each halfword element in > source > + * register and placing result in appropriate halfword element of > destination > + * register. 
> + */ > +static void trans_vclzh(DisasContext *ctx) > +{ > + int VT = rD(ctx->opcode); > + int VB = rB(ctx->opcode); > + TCGv_i64 avr = tcg_temp_new_i64(); > + TCGv_i64 result = tcg_temp_new_i64(); > + TCGv_i64 result1 = tcg_temp_new_i64(); > + TCGv_i64 result2 = tcg_temp_new_i64(); > + TCGv_i64 tmp = tcg_temp_new_i64(); > + TCGv_i64 mask = tcg_const_i64(0xffffffffffffULL); > + int i, j; > + > + for (i = 0; i < 2; i++) { > + if (i == 0) { > + /* Get high doubleword element of vB in 'avr'. */ > + get_avr64(avr, VB, true); > + } else { > + /* Get low doubleword element of vB in 'avr'. */ > + get_avr64(avr, VB, false); > + } > + /* > + * Perform count for every halfword element using > 'tcg_gen_clzi_i64'. > + * Since it counts leading zeros on 64 bit lenght, we have to move > + * ith byte element to highest 16 bits of 'tmp', or it with > mask(so we > + * get all ones in lowest 48 bits), then perform > 'tcg_gen_clzi_i64' and > + * move it's result in appropriate halfword element of result. > + */ > + tcg_gen_shli_i64(tmp, avr, 48); > + tcg_gen_or_i64(tmp, tmp, mask); > + tcg_gen_clzi_i64(result, tmp, 64); > + for (j = 1; j < 3; j++) { > + tcg_gen_shli_i64(tmp, avr, (3 - j) * 16); > + tcg_gen_or_i64(tmp, tmp, mask); > + tcg_gen_clzi_i64(tmp, tmp, 64); > + tcg_gen_deposit_i64(result, result, tmp, j * 16, 16); > + } > + tcg_gen_or_i64(tmp, avr, mask); > + tcg_gen_clzi_i64(tmp, tmp, 64); > + tcg_gen_deposit_i64(result, result, tmp, 48, 16); > + if (i == 0) { > + /* Place result in high doubleword element of vD. */ > + tcg_gen_mov_i64(result1, result); > + } else { > + /* Place result in low doubleword element of vD. 
*/ > + tcg_gen_mov_i64(result2, result); > + } > + } > + > + set_avr64(VT, result1, true); > + set_avr64(VT, result2, false); > + > + tcg_temp_free_i64(avr); > + tcg_temp_free_i64(result); > + tcg_temp_free_i64(result1); > + tcg_temp_free_i64(result2); > + tcg_temp_free_i64(tmp); > + tcg_temp_free_i64(mask); > +} > + > +/* > * vclzw VRT,VRB - Vector Count Leading Zeros Word > * > * Counting the number of leading zero bits of each word element in source > @@ -1315,8 +1447,8 @@ GEN_VAFORM_PAIRED(vmsumshm, vmsumshs, 20) > GEN_VAFORM_PAIRED(vsel, vperm, 21) > GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23) > > -GEN_VXFORM_NOA(vclzb, 1, 28) > -GEN_VXFORM_NOA(vclzh, 1, 29) > +GEN_VXFORM_TRANS(vclzb, 1, 28) > +GEN_VXFORM_TRANS(vclzh, 1, 29) > GEN_VXFORM_TRANS(vclzw, 1, 30) > GEN_VXFORM_TRANS(vclzd, 1, 31) > GEN_VXFORM_NOA_2(vnegw, 1, 24, 6) > -- > 2.7.4 > > >