On 2/14/21 9:58 AM, Philippe Mathieu-Daudé wrote: > Introduce the 'Parallel Extend Lower' opcodes:
$SUBJECT s/PEXTU/PEXTL/. > + /* Lower halve */ > + for (int i = 0; i < 64 / (2 * wlen); i++) { > + tcg_gen_deposit_i64(cpu_gpr[a->rd], > + cpu_gpr[a->rd], bx, 2 * wlen * i, wlen); > + tcg_gen_deposit_i64(cpu_gpr[a->rd], > + cpu_gpr[a->rd], ax, 2 * wlen * i + wlen, wlen); > + tcg_gen_shri_i64(bx, bx, wlen); > + tcg_gen_shri_i64(ax, ax, wlen); > + } > + /* Upper halve */ > + for (int i = 0; i < 64 / (2 * wlen); i++) { > + tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], > + cpu_gpr_hi[a->rd], bx, 2 * wlen * i, wlen); > + tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], > + cpu_gpr_hi[a->rd], ax, 2 * wlen * i + wlen, > wlen); > + tcg_gen_shri_i64(bx, bx, wlen); > + tcg_gen_shri_i64(ax, ax, wlen); > + } Right, so, this expands to (4 * 4 * 2) = 32 operations for pextlb, if deposit is supported, or ((4*2 + 2) * 4 * 2) = 80 operations if not (4 per deposit). We can do a bit better, though, exploiting parallelism. /* 5 or 8 operations, w/ or w/o deposit */ void gen_widen_b(TCGv_i64 d, TCGv_i64 s) { TCGv_i64 x = tcg_temp_new_i64(); TCGv_i64 y = tcg_temp_new_i64(); TCGv_i64 m0 = tcg_constant_i64(0x0000ff000000ff00ull); /* s = abcdefgh */ tcg_gen_deposit_i64(x, s, s, 16, 48); /* x = cdefghgh */ tcg_gen_and_i64(y, x, m0); /* y = 00e000g0 */ tcg_gen_andi_i64(x, x, 0x000000ff000000ffull); /* x = 000f000h */ tcg_gen_shli_i64(y, y, 8); /* y = 0e000g00 */ tcg_gen_or_i64(d, x, y); /* d = 0e0f0g0h */ tcg_temp_free_i64(x); tcg_temp_free_i64(y); } /* 12 or 18 operations w/ or w/o deposit */ void gen_pextb(TCGv_i64 d, TCGv_i64 s, TCGv_i64 t) { TCGv_i64 x = tcg_temp_new_i64(); gen_widen_b(x, s); gen_widen_b(d, t); tcg_gen_shli_i64(x, x, 8); tcg_gen_or_i64(d, d, x); tcg_temp_free_i64(x); } then gen_read_gpr(s, a->rs); gen_read_gpr(t, a->rt); gen_pextb(cpu_gpr[a->rd], s, t); tcg_gen_shri_i64(s, s, 32); tcg_gen_shri_i64(t, t, 32); gen_pextb(cpu_gpr_hi[a->rd], s, t); gives you the result in 26 or 38 operations. 
Similarly void gen_widen_h(TCGv_i64 d, TCGv_i64 s) { TCGv_i64 x = tcg_temp_new_i64(); /* s = abcd */ tcg_gen_andi_i64(x, s, 0xffff0000u); /* x = 00c0 */ tcg_gen_deposit_i64(d, s, x, 16, 48); /* d = 0c0d */ tcg_temp_free_i64(x); } r~