On Wed, Dec 30, 2015 at 1:17 AM, Roland Scheidegger <srol...@vmware.com> wrote: > The idea looks right to me. > Though frankly I don't like our current setup code too much - in > particular the mix between C, assembly, and JIT code, with some > duplication (plus the many transposes everywhere). There's likely > optimization potential to be found there. > > Roland
Agreed, if we can remove the transpose, that would be some 3-5% boost for sure. Oded > > Am 29.12.2015 um 17:12 schrieb Oded Gabbay: >> This patch converts the SSE optimization done in do_triangle_ccw to >> VMX/VSX. >> >> I measured the results on POWER8 machine with 32 cores at 3.4GHz and >> 16GB of RAM. >> >> FPS/Score >> Name Before After Delta >> ------------------------------------------------ >> glmark2 (score) 136.6 139.8 2.34% >> openarena 16.14 16.35 1.30% >> xonotic 4.655 4.707 1.11% >> >> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> >> --- >> src/gallium/drivers/llvmpipe/lp_setup_tri.c | 96 >> +++++++++++++++++++++++++++++ >> 1 file changed, 96 insertions(+) >> >> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> b/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> index b1671dd..cfa9874 100644 >> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c >> @@ -46,6 +46,9 @@ >> >> #if defined(PIPE_ARCH_SSE) >> #include <emmintrin.h> >> +#elif defined(_ARCH_PWR8) >> +#include <altivec.h> >> +#include "util/u_pwr8.h" >> #endif >> >> static inline int >> @@ -462,6 +465,99 @@ do_triangle_ccw(struct lp_setup_context *setup, >> STORE_PLANE(plane[2], p2); >> #undef STORE_PLANE >> } else >> +#elif defined(_ARCH_PWR8) >> + if (setup->fb.width <= MAX_FIXED_LENGTH32 && >> + setup->fb.height <= MAX_FIXED_LENGTH32 && >> + (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && >> + (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) { >> + unsigned int bottom_edge; >> + __m128i vertx, verty; >> + __m128i shufx, shufy; >> + __m128i dcdx, dcdy, c; >> + __m128i unused; >> + __m128i dcdx_neg_mask; >> + __m128i dcdy_neg_mask; >> + __m128i dcdx_zero_mask; >> + __m128i top_left_flag; >> + __m128i c_inc_mask, c_inc; >> + __m128i eo, p0, p1, p2; >> + __m128i_union vshuf_mask; >> + __m128i zero = vec_splats((unsigned char) 0); >> + PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; >> + >> +#ifdef PIPE_ARCH_LITTLE_ENDIAN >> + vshuf_mask.i[0] = 
0x07060504; >> + vshuf_mask.i[1] = 0x0B0A0908; >> + vshuf_mask.i[2] = 0x03020100; >> + vshuf_mask.i[3] = 0x0F0E0D0C; >> +#else >> + vshuf_mask.i[0] = 0x00010203; >> + vshuf_mask.i[1] = 0x0C0D0E0F; >> + vshuf_mask.i[2] = 0x04050607; >> + vshuf_mask.i[3] = 0x08090A0B; >> +#endif >> + >> + /* vertex x coords */ >> + vertx = vec_loadu_si128((const uint32_t *) position->x); >> + /* vertex y coords */ >> + verty = vec_loadu_si128((const uint32_t *) position->y); >> + >> + shufx = vec_perm (vertx, vertx, vshuf_mask.m128i); >> + shufy = vec_perm (verty, verty, vshuf_mask.m128i); >> + >> + dcdx = vec_sub_epi32(verty, shufy); >> + dcdy = vec_sub_epi32(vertx, shufx); >> + >> + dcdx_neg_mask = vec_srai_epi32(dcdx, 31); >> + dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero); >> + dcdy_neg_mask = vec_srai_epi32(dcdy, 31); >> + >> + bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0; >> + top_left_flag = (__m128i) vec_splats(bottom_edge); >> + >> + c_inc_mask = vec_or(dcdx_neg_mask, >> + vec_and(dcdx_zero_mask, >> + vec_xor(dcdy_neg_mask, >> + >> top_left_flag))); >> + >> + c_inc = vec_srli_epi32(c_inc_mask, 31); >> + >> + c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx), >> + vec_mullo_epi32(dcdy, verty)); >> + >> + c = vec_add_epi32(c, c_inc); >> + >> + /* Scale up to match c: >> + */ >> + dcdx = vec_slli_epi32(dcdx, FIXED_ORDER); >> + dcdy = vec_slli_epi32(dcdy, FIXED_ORDER); >> + >> + /* Calculate trivial reject values: >> + */ >> + eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy), >> + vec_and(dcdx_neg_mask, dcdx)); >> + >> + /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ >> + >> + /* Pointless transpose which gets undone immediately in >> + * rasterization: >> + */ >> + transpose4_epi32(&c, &dcdx, &dcdy, &eo, >> + &p0, &p1, &p2, &unused); >> + >> +#define STORE_PLANE(plane, vec) do { \ >> + vec_store_si128((uint32_t *)&temp_vec, vec); \ >> + plane.c = (int64_t)temp_vec[0]; \ >> + plane.dcdx = temp_vec[1]; \ >> + plane.dcdy = temp_vec[2]; \ >> + plane.eo = temp_vec[3]; 
\ >> + } while(0) >> + >> + STORE_PLANE(plane[0], p0); >> + STORE_PLANE(plane[1], p1); >> + STORE_PLANE(plane[2], p2); >> +#undef STORE_PLANE >> + } else >> #endif >> { >> int i; >> > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev