On 8/27/19 2:37 AM, Stefan Brankovic wrote:
> +    for (i = 0; i < 4; i++) {
> +        switch (i) {
> +        case 0:
> +            /*
> +             * Get high doubleword of vA to perfrom 6-5-5 pack of pixels
> +             * 1 and 2.
> +             */
> +            get_avr64(avr, VA, true);
> +            tcg_gen_movi_i64(result, 0x0ULL);
> +            break;
> +        case 1:
> +            /*
> +             * Get low doubleword of vA to perfrom 6-5-5 pack of pixels
> +             * 3 and 4.
> +             */
> +            get_avr64(avr, VA, false);
> +            break;
> +        case 2:
> +            /*
> +             * Get high doubleword of vB to perfrom 6-5-5 pack of pixels
> +             * 5 and 6.
> +             */
> +            get_avr64(avr, VB, true);
> +            tcg_gen_movi_i64(result, 0x0ULL);
> +            break;
> +        case 3:
> +            /*
> +             * Get low doubleword of vB to perfrom 6-5-5 pack of pixels
> +             * 7 and 8.
> +             */
> +            get_avr64(avr, VB, false);
> +            break;
> +        }
> +        /* Perform the packing for 2 pixels(each iteration for 1). */
> +        tcg_gen_movi_i64(tmp, 0x0ULL);
> +        for (j = 0; j < 2; j++) {
> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 3));
> +            tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16));
> +            tcg_gen_or_i64(tmp, tmp, shifted);
> +
> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 6));
> +            tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16));
> +            tcg_gen_or_i64(tmp, tmp, shifted);
> +
> +            tcg_gen_shri_i64(shifted, avr, (j * 16 + 9));
> +            tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16));
> +            tcg_gen_or_i64(tmp, tmp, shifted);
> +        }
> +        if ((i == 0) || (i == 2)) {
> +            tcg_gen_shli_i64(tmp, tmp, 32);
> +        }
> +        tcg_gen_or_i64(result, result, tmp);
> +        if (i == 1) {
> +            /* Place packed pixels 1:4 to high doubleword of vD. */
> +            tcg_gen_mov_i64(result1, result);
> +        }
> +        if (i == 3) {
> +            /* Place packed pixels 5:8 to low doubleword of vD. */
> +            tcg_gen_mov_i64(result2, result);
> +        }
> +    }
> +    set_avr64(VT, result1, true);
> +    set_avr64(VT, result2, false);
I really have a hard time believing that it is worthwhile to inline all of
this code.  By my count this is 82 non-move opcodes.  That is a *lot* of
inline expansion.

However, I can well imagine that the existing out-of-line helper is less
than optimal.

> -void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
> -{
> -    int i, j;
> -    ppc_avr_t result;
> -#if defined(HOST_WORDS_BIGENDIAN)
> -    const ppc_avr_t *x[2] = { a, b };
> -#else
> -    const ppc_avr_t *x[2] = { b, a };
> -#endif
> -
> -    VECTOR_FOR_INORDER_I(i, u64) {
> -        VECTOR_FOR_INORDER_I(j, u32) {
> -            uint32_t e = x[i]->u32[j];

Double indirect loads?

> -
> -            result.u16[4 * i + j] = (((e >> 9) & 0xfc00) |
> -                                     ((e >> 6) & 0x3e0) |
> -                                     ((e >> 3) & 0x1f));

Store to temporary ...

> -        }
> -    }
> -    *r = result;

... and then copy?

Try replacing the existing helper with something like the following.

r~

static inline uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;
    return r;
}

static inline uint64_t pkpx_2(uint64_t ah, uint64_t al)
{
    return pkpx_1(ah, 32, 48)
         | pkpx_1(ah, 0, 32)
         | pkpx_1(al, 32, 16)
         | pkpx_1(al, 0, 0);
}

void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
{
    uint64_t rh = pkpx_2(a->VsrD(0), a->VsrD(1));
    uint64_t rl = pkpx_2(b->VsrD(0), b->VsrD(1));
    r->VsrD(0) = rh;
    r->VsrD(1) = rl;
}
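As a quick sanity check of the bit selection (a standalone host-side sketch, not
part of the proposed patch; the pack_655 name and the sample pixel values are
made up for illustration), pkpx_1 can be compared against the per-pixel 6-5-5
formula from the original helper:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Per-pixel 6-5-5 pack, as in the original helper's u16 expression. */
static uint16_t pack_655(uint32_t e)
{
    return ((e >> 9) & 0xfc00) | ((e >> 6) & 0x3e0) | ((e >> 3) & 0x1f);
}

/* Same shift/mask scheme as the pkpx_1 suggested above. */
static uint64_t pkpx_1(uint64_t a, int shr, int shl)
{
    uint64_t r;

    r  = ((a >> (shr + 9)) & 0xfc00) << shl;
    r |= ((a >> (shr + 6)) & 0x3e0) << shl;
    r |= ((a >> (shr + 3)) & 0x1f) << shl;
    return r;
}

int main(void)
{
    /* Two arbitrary 32-bit pixels held in one doubleword. */
    uint64_t dw = ((uint64_t)0x01234567 << 32) | 0x89abcdef;

    /* Both routes must select the same 6-5-5 bits from each pixel. */
    assert((pkpx_1(dw, 32, 16) >> 16) == pack_655(0x01234567));
    assert(pkpx_1(dw, 0, 0) == pack_655(0x89abcdef));

    printf("0x01234567 -> 0x%04x, 0x89abcdef -> 0x%04x\n",
           pack_655(0x01234567), pack_655(0x89abcdef));
    return 0;
}

This only checks the field extraction for one doubleword; the placement of the
packed halves into vD still depends on VsrD ordering, which the sketch does not
model.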