ppc: Optimize emulation of vclzh and vclzb instructions

Aleksandar Markovic Sat, 19 Oct 2019 12:52:23 -0700

On Thursday, October 17, 2019, Stefan Brankovic <stefan.branko...@rt-rk.com>
wrote:


> Optimize Altivec instruction vclzh (Vector Count Leading Zeros Halfword).
> This instruction counts the number of leading zeros of each halfword
> element
> in source register and places result in the appropriate halfword element of
> destination register.
>
> In each iteration of outer for loop count operation is performed on one
> doubleword element of source register vB. In the first iteration, higher
> doubleword element of vB is placed in variable avr, and then counting
> for every halfword element is performed by  using tcg_gen_clzi_i64.
> Since it counts leading zeros on 64 bit length, ith halfword element has to
> be moved to the highest 16 bits of tmp, or-ed with mask (in order to get all
> ones in lowest 48 bits), then perform tcg_gen_clzi_i64 and move its result
> in appropriate halfword element of result. This is done in inner for loop.
> After the operation is finished, the result is saved in the appropriate
> doubleword element of destination register vD. The same sequence of orders
> is to be applied again for the  lower doubleword element of vB.
>
> Optimize Altivec instruction vclzb (Vector Count Leading Zeros Byte).
> This instruction counts the number of leading zeros of each byte element
> in source register and places result in the appropriate byte element of
> destination register.
>
> In each iteration of the outer for loop, counting operation is done on one
> doubleword element of source register vB. In the first iteration, the
> higher doubleword element of vB is placed in variable avr, and then
> counting
> for every byte element is performed using tcg_gen_clzi_i64. Since it counts
> leading zeros on 64 bit length, ith byte element has to be moved to the
> highest
> 8 bits of variable tmp, or-ed with mask (in order to get all ones in the
> lowest
> 56 bits), then perform tcg_gen_clzi_i64 and move its result in the
> appropriate
> byte element of result. This is done in inner for loop. After the
> operation is
> finished, the result is saved in the  appropriate doubleword element of
> destination
> register vD. The same sequence of orders is to be applied again for the
> lower
> doubleword element of vB.
>
>
The same hints as for the commit message of patch 2/3.

Additionally, the first and the third paragraph should be merged into a
single one at the beginning of the commit message.


> Signed-off-by: Stefan Brankovic <stefan.branko...@rt-rk.com>
> ---
>  target/ppc/helper.h                 |   2 -
>  target/ppc/int_helper.c             |   9 ---
>  target/ppc/translate/vmx-impl.inc.c | 136 ++++++++++++++++++++++++++++++
> +++++-
>  3 files changed, 134 insertions(+), 13 deletions(-)
>
> diff --git a/target/ppc/helper.h b/target/ppc/helper.h
> index f843814..281e54f 100644
> --- a/target/ppc/helper.h
> +++ b/target/ppc/helper.h
> @@ -308,8 +308,6 @@ DEF_HELPER_4(vcfsx, void, env, avr, avr, i32)
>  DEF_HELPER_4(vctuxs, void, env, avr, avr, i32)
>  DEF_HELPER_4(vctsxs, void, env, avr, avr, i32)
>
> -DEF_HELPER_2(vclzb, void, avr, avr)
> -DEF_HELPER_2(vclzh, void, avr, avr)
>  DEF_HELPER_2(vctzb, void, avr, avr)
>  DEF_HELPER_2(vctzh, void, avr, avr)
>  DEF_HELPER_2(vctzw, void, avr, avr)
> diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
> index 6d238b9..cd00f5e 100644
> --- a/target/ppc/int_helper.c
> +++ b/target/ppc/int_helper.c
> @@ -1817,15 +1817,6 @@ VUPK(lsw, s64, s32, UPKLO)
>          }                                                               \
>      }
>
> -#define clzb(v) ((v) ? clz32((uint32_t)(v) << 24) : 8)
> -#define clzh(v) ((v) ? clz32((uint32_t)(v) << 16) : 16)
> -
> -VGENERIC_DO(clzb, u8)
> -VGENERIC_DO(clzh, u16)
> -
> -#undef clzb
> -#undef clzh
> -
>  #define ctzb(v) ((v) ? ctz32(v) : 8)
>  #define ctzh(v) ((v) ? ctz32(v) : 16)
>  #define ctzw(v) ctz32((v))
> diff --git a/target/ppc/translate/vmx-impl.inc.c
> b/target/ppc/translate/vmx-impl.inc.c
> index 2472a52..a428ef3 100644
> --- a/target/ppc/translate/vmx-impl.inc.c
> +++ b/target/ppc/translate/vmx-impl.inc.c
> @@ -751,6 +751,138 @@ static void trans_vgbbd(DisasContext *ctx)
>  }
>
>  /*
> + * vclzb VRT,VRB - Vector Count Leading Zeros Byte
> + *
> + * Counting the number of leading zero bits of each byte element in source
> + * register and placing result in appropriate byte element of destination
> + * register.
> + */
> +static void trans_vclzb(DisasContext *ctx)
> +{
> +    int VT = rD(ctx->opcode);
> +    int VB = rB(ctx->opcode);
> +    TCGv_i64 avr = tcg_temp_new_i64();
> +    TCGv_i64 result = tcg_temp_new_i64();
> +    TCGv_i64 result1 = tcg_temp_new_i64();
> +    TCGv_i64 result2 = tcg_temp_new_i64();
> +    TCGv_i64 tmp = tcg_temp_new_i64();
> +    TCGv_i64 mask = tcg_const_i64(0xffffffffffffffULL);
> +    int i, j;
> +
> +    for (i = 0; i < 2; i++) {
> +        if (i == 0) {
> +            /* Get high doubleword of vB in 'avr'. */
> +            get_avr64(avr, VB, true);
> +        } else {
> +            /* Get low doubleword of vB in 'avr'. */
> +            get_avr64(avr, VB, false);
> +        }
> +        /*
> +         * Perform count for every byte element using 'tcg_gen_clzi_i64'.
> +         * Since it counts leading zeros on 64 bit lenght, we have to move
> +         * ith byte element to highest 8 bits of 'tmp', or it with
> mask(so we
> +         * get all ones in lowest 56 bits), then perform
> 'tcg_gen_clzi_i64' and
> +         * move it's result in appropriate byte element of result.
> +         */
> +        tcg_gen_shli_i64(tmp, avr, 56);


before this line, insert a blank line and the comment: /* count leading
zeroes for bits 0..7 */


> +        tcg_gen_or_i64(tmp, tmp, mask);
> +        tcg_gen_clzi_i64(result, tmp, 64);
> +        for (j = 1; j < 7; j++) {
> +            tcg_gen_shli_i64(tmp, avr, (7 - j) * 8);


before this line, insert comment: /* count leading zeroes for bits
8*j..8*j+7  */


> +            tcg_gen_or_i64(tmp, tmp, mask);
> +            tcg_gen_clzi_i64(tmp, tmp, 64);
> +            tcg_gen_deposit_i64(result, result, tmp, j * 8, 8);
> +        }
> +        tcg_gen_or_i64(tmp, avr, mask);


before this line, insert comment: /* count leading zeroes for bits 56..63
 */


> +        tcg_gen_clzi_i64(tmp, tmp, 64);
> +        tcg_gen_deposit_i64(result, result, tmp, 56, 8);

> +        if (i == 0) {
> +            /* Place result in high doubleword element of vD. */
> +            tcg_gen_mov_i64(result1, result);
> +        } else {
> +            /* Place result in low doubleword element of vD. */
> +            tcg_gen_mov_i64(result2, result);
> +        }
> +    }
> +
> +    set_avr64(VT, result1, true);
> +    set_avr64(VT, result2, false);
> +
> +    tcg_temp_free_i64(avr);
> +    tcg_temp_free_i64(result);
> +    tcg_temp_free_i64(result1);
> +    tcg_temp_free_i64(result2);
> +    tcg_temp_free_i64(tmp);
> +    tcg_temp_free_i64(mask);
> +}
> +
> +/*
> + * vclzh VRT,VRB - Vector Count Leading Zeros Halfword


Similar hints as in the case of the previous function.


> + *
> + * Counting the number of leading zero bits of each halfword element in
> source
> + * register and placing result in appropriate halfword element of
> destination
> + * register.
> + */
> +static void trans_vclzh(DisasContext *ctx)
> +{
> +    int VT = rD(ctx->opcode);
> +    int VB = rB(ctx->opcode);
> +    TCGv_i64 avr = tcg_temp_new_i64();
> +    TCGv_i64 result = tcg_temp_new_i64();
> +    TCGv_i64 result1 = tcg_temp_new_i64();
> +    TCGv_i64 result2 = tcg_temp_new_i64();
> +    TCGv_i64 tmp = tcg_temp_new_i64();
> +    TCGv_i64 mask = tcg_const_i64(0xffffffffffffULL);
> +    int i, j;
> +
> +    for (i = 0; i < 2; i++) {
> +        if (i == 0) {
> +            /* Get high doubleword element of vB in 'avr'. */
> +            get_avr64(avr, VB, true);
> +        } else {
> +            /* Get low doubleword element of vB in 'avr'. */
> +            get_avr64(avr, VB, false);
> +        }
> +        /*
> +         * Perform count for every halfword element using
> 'tcg_gen_clzi_i64'.
> +         * Since it counts leading zeros on 64 bit lenght, we have to move
> +         * ith byte element to highest 16 bits of 'tmp', or it with
> mask(so we
> +         * get all ones in lowest 48 bits), then perform
> 'tcg_gen_clzi_i64' and
> +         * move it's result in appropriate halfword element of result.
> +         */
> +        tcg_gen_shli_i64(tmp, avr, 48);
> +        tcg_gen_or_i64(tmp, tmp, mask);
> +        tcg_gen_clzi_i64(result, tmp, 64);
> +        for (j = 1; j < 3; j++) {
> +            tcg_gen_shli_i64(tmp, avr, (3 - j) * 16);
> +            tcg_gen_or_i64(tmp, tmp, mask);
> +            tcg_gen_clzi_i64(tmp, tmp, 64);
> +            tcg_gen_deposit_i64(result, result, tmp, j * 16, 16);
> +        }
> +        tcg_gen_or_i64(tmp, avr, mask);
> +        tcg_gen_clzi_i64(tmp, tmp, 64);
> +        tcg_gen_deposit_i64(result, result, tmp, 48, 16);
> +        if (i == 0) {
> +            /* Place result in high doubleword element of vD. */
> +            tcg_gen_mov_i64(result1, result);
> +        } else {
> +            /* Place result in low doubleword element of vD. */
> +            tcg_gen_mov_i64(result2, result);
> +        }
> +    }
> +
> +    set_avr64(VT, result1, true);
> +    set_avr64(VT, result2, false);
> +
> +    tcg_temp_free_i64(avr);
> +    tcg_temp_free_i64(result);
> +    tcg_temp_free_i64(result1);
> +    tcg_temp_free_i64(result2);
> +    tcg_temp_free_i64(tmp);
> +    tcg_temp_free_i64(mask);
> +}
> +
> +/*
>   * vclzw VRT,VRB - Vector Count Leading Zeros Word
>   *
>   * Counting the number of leading zero bits of each word element in source
> @@ -1315,8 +1447,8 @@ GEN_VAFORM_PAIRED(vmsumshm, vmsumshs, 20)
>  GEN_VAFORM_PAIRED(vsel, vperm, 21)
>  GEN_VAFORM_PAIRED(vmaddfp, vnmsubfp, 23)
>
> -GEN_VXFORM_NOA(vclzb, 1, 28)
> -GEN_VXFORM_NOA(vclzh, 1, 29)
> +GEN_VXFORM_TRANS(vclzb, 1, 28)
> +GEN_VXFORM_TRANS(vclzh, 1, 29)
>  GEN_VXFORM_TRANS(vclzw, 1, 30)
>  GEN_VXFORM_TRANS(vclzd, 1, 31)
>  GEN_VXFORM_NOA_2(vnegw, 1, 24, 6)
> --
> 2.7.4
>
>
>

Re: [PATCH v7 1/3] target/ppc: Optimize emulation of vclzh and vclzb instructions

Reply via email to