On Thu, Dec 31, 2015 at 4:13 AM, Roland Scheidegger <srol...@vmware.com> wrote: > > Am 30.12.2015 um 10:59 schrieb Oded Gabbay: > > On Wed, Dec 30, 2015 at 1:17 AM, Roland Scheidegger <srol...@vmware.com> > > wrote: > >> The idea looks right to me. > >> Though frankly I don't like our current setup code too much - in > >> particular the mix between c, assembly, and jit code, with some > >> duplication (plus the lots of transpose everywhere). There's likely > >> optimization potential to be found there. > >> > >> Roland > > > > Agreed, if we can remove the transpose, that would be some 3-5% boost for > > sure. > > > > I actually forgot about this, but I'm really surprised you could measure > a difference as this code (but not the one in rasterization) is > effectively disabled in practice ever since the switch to 8 bit subpixel > precision (as the max fb size allowed now is really tiny, 128x128...) > Unless you were playing with those values... > Should have probably added some XXX comment back then... > > Roland
Indeed, glxgears and glxspheres don't reach this code. However, glmark2 has a couple of tests that do run this code (I put prints to check for it). But I'll add the comment you added in the patch you just sent. Oded > > > > > >> > >> Am 29.12.2015 um 17:12 schrieb Oded Gabbay: > >>> This patch converts the SSE optimization done in do_triangle_ccw to > >>> VMX/VSX. > >>> > >>> I measured the results on POWER8 machine with 32 cores at 3.4GHz and > >>> 16GB of RAM. > >>> > >>> FPS/Score > >>> Name Before After Delta > >>> ------------------------------------------------ > >>> glmark2 (score) 136.6 139.8 2.34% > >>> openarena 16.14 16.35 1.30% > >>> xonotic 4.655 4.707 1.11% > >>> > >>> Signed-off-by: Oded Gabbay <oded.gab...@gmail.com> > >>> --- > >>> src/gallium/drivers/llvmpipe/lp_setup_tri.c | 96 > >>> +++++++++++++++++++++++++++++ > >>> 1 file changed, 96 insertions(+) > >>> > >>> diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c > >>> b/src/gallium/drivers/llvmpipe/lp_setup_tri.c > >>> index b1671dd..cfa9874 100644 > >>> --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c > >>> +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c > >>> @@ -46,6 +46,9 @@ > >>> > >>> #if defined(PIPE_ARCH_SSE) > >>> #include <emmintrin.h> > >>> +#elif defined(_ARCH_PWR8) > >>> +#include <altivec.h> > >>> +#include "util/u_pwr8.h" > >>> #endif > >>> > >>> static inline int > >>> @@ -462,6 +465,99 @@ do_triangle_ccw(struct lp_setup_context *setup, > >>> STORE_PLANE(plane[2], p2); > >>> #undef STORE_PLANE > >>> } else > >>> +#elif defined(_ARCH_PWR8) > >>> + if (setup->fb.width <= MAX_FIXED_LENGTH32 && > >>> + setup->fb.height <= MAX_FIXED_LENGTH32 && > >>> + (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 && > >>> + (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) { > >>> + unsigned int bottom_edge; > >>> + __m128i vertx, verty; > >>> + __m128i shufx, shufy; > >>> + __m128i dcdx, dcdy, c; > >>> + __m128i unused; > >>> + __m128i dcdx_neg_mask; > >>> + __m128i dcdy_neg_mask; > >>> + __m128i dcdx_zero_mask; > >>> + __m128i top_left_flag; > >>> + __m128i c_inc_mask, c_inc; > >>> + __m128i eo, p0, p1, p2; > >>> + __m128i_union vshuf_mask; > >>> + __m128i zero = vec_splats((unsigned char) 0); > >>> + PIPE_ALIGN_VAR(16) int32_t temp_vec[4]; > >>> + > >>> +#ifdef PIPE_ARCH_LITTLE_ENDIAN > >>> + vshuf_mask.i[0] = 0x07060504; > >>> + vshuf_mask.i[1] = 0x0B0A0908; > >>> + vshuf_mask.i[2] = 0x03020100; > >>> + vshuf_mask.i[3] = 0x0F0E0D0C; > >>> +#else > >>> + vshuf_mask.i[0] = 0x00010203; > >>> + vshuf_mask.i[1] = 0x0C0D0E0F; > >>> + vshuf_mask.i[2] = 0x04050607; > >>> + vshuf_mask.i[3] = 0x08090A0B; > >>> +#endif > >>> + > >>> + /* vertex x coords */ > >>> + vertx = vec_loadu_si128((const uint32_t *) position->x); > >>> + /* vertex y coords */ > >>> + verty = vec_loadu_si128((const uint32_t *) position->y); > >>> + > >>> + shufx = vec_perm (vertx, vertx, vshuf_mask.m128i); > >>> + shufy = vec_perm (verty, verty, vshuf_mask.m128i); > >>> + > >>> + dcdx = vec_sub_epi32(verty, shufy); > >>> + dcdy = vec_sub_epi32(vertx, shufx); > >>> + > >>> + dcdx_neg_mask = vec_srai_epi32(dcdx, 31); > >>> + dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero); > >>> + dcdy_neg_mask = vec_srai_epi32(dcdy, 31); > >>> + > >>> + bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0; > >>> + top_left_flag = (__m128i) vec_splats(bottom_edge); > >>> + > >>> + c_inc_mask = vec_or(dcdx_neg_mask, > >>> + vec_and(dcdx_zero_mask, > >>> + vec_xor(dcdy_neg_mask, > >>> + > >>> top_left_flag))); > >>> + > >>> + c_inc = vec_srli_epi32(c_inc_mask, 31); > >>> + > >>> + c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx), > >>> + vec_mullo_epi32(dcdy, verty)); > >>> + > >>> + c = vec_add_epi32(c, c_inc); > >>> + > >>> + /* Scale up to match c: > >>> + */ > >>> + dcdx = vec_slli_epi32(dcdx, FIXED_ORDER); > >>> + dcdy = vec_slli_epi32(dcdy, FIXED_ORDER); > >>> + > >>> + /* Calculate trivial reject values: > >>> + */ > >>> + eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy), > >>> + vec_and(dcdx_neg_mask, dcdx)); > >>> + > >>> + /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */ > >>> + > >>> + /* Pointless transpose which gets undone immediately in > >>> + * rasterization: > >>> + */ > >>> + transpose4_epi32(&c, &dcdx, &dcdy, &eo, > >>> + &p0, &p1, &p2, &unused); > >>> + > >>> +#define STORE_PLANE(plane, vec) do { \ > >>> + vec_store_si128((uint32_t *)&temp_vec, vec); \ > >>> + plane.c = (int64_t)temp_vec[0]; \ > >>> + plane.dcdx = temp_vec[1]; \ > >>> + plane.dcdy = temp_vec[2]; \ > >>> + plane.eo = temp_vec[3]; \ > >>> + } while(0) > >>> + > >>> + STORE_PLANE(plane[0], p0); > >>> + STORE_PLANE(plane[1], p1); > >>> + STORE_PLANE(plane[2], p2); > >>> +#undef STORE_PLANE > >>> + } else > >>> #endif > >>> { > >>> int i; > >>> > >> > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev