mas...@eltechs.com writes: > From: Maxim Maslov <mas...@eltechs.com>
The commit message needs some explanation of why we would want this (given that the 2835 is an ARM SoC) and some performance data justifying the change. > > --- src/gallium/drivers/vc4/vc4_tiling_lt.c | 93 >+++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 3 >deletions(-) > > diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c > b/src/gallium/drivers/vc4/vc4_tiling_lt.c > index c9cbc65..d291262 100644 > --- a/src/gallium/drivers/vc4/vc4_tiling_lt.c > +++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c > @@ -105,6 +105,49 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t > cpu_stride, uint32_t cpp) > : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride) > : "q0", "q1", "q2", "q3"); > } > +#elif defined(USE_SSE_ASM) > + if (gpu_stride == 8) { > + __asm__ volatile ( > + "movdqu 0(%1), %%xmm0;" > + "movdqu 0x10(%1), %%xmm1;" > + "movdqu 0x20(%1), %%xmm2;" > + "movdqu 0x30(%1), %%xmm3;" > + "movlpd %%xmm0, 0(%0);" > + "mov %2, %%ecx;" > + "movhpd %%xmm0, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movlpd %%xmm1, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movhpd %%xmm1, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movlpd %%xmm2, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movhpd %%xmm2, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movlpd %%xmm3, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movhpd %%xmm3, 0(%0,%%ecx,1);" > + : > + : "r"(cpu), "r"(gpu), "r"(cpu_stride) > + : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%ecx"); > + } else { > + assert(gpu_stride == 16); > + __asm__ volatile ( > + "movdqu 0(%1), %%xmm0;" > + "movdqu 0x10(%1), %%xmm1;" > + "movdqu 0x20(%1), %%xmm2;" > + "movdqu 0x30(%1), %%xmm3;" > + "movdqu %%xmm0, 0(%0);" > + "mov %2, %%ecx;" > + "movdqu %%xmm1, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movdqu %%xmm2, 0(%0,%%ecx,1);" > + "add %2, %%ecx;" > + "movdqu %%xmm3, 0(%0,%%ecx,1);" > + : > + : "r"(cpu), "r"(gpu), "r"(cpu_stride) > + : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%ecx"); > + } Using SSE in Mesa requires runtime detection of whether SSE is actually present. 
> #endif > - > } > > void > @@ -175,6 +260,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t > dst_stride, > int cpp, const struct pipe_box *box) > { > uint32_t utile_w = vc4_utile_width(cpp); > + uint32_t xfactor = 64 / utile_w; > uint32_t utile_h = vc4_utile_height(cpp); > uint32_t xstart = box->x; > uint32_t ystart = box->y; > @@ -184,7 +270,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t > dst_stride, > vc4_load_utile(dst + (dst_stride * y + > x * cpp), > src + ((ystart + y) * src_stride + > - (xstart + x) * 64 / utile_w), > + (xstart + x) * xfactor), > dst_stride, cpp); > } > } > @@ -196,6 +282,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t > dst_stride, > int cpp, const struct pipe_box *box) > { > uint32_t utile_w = vc4_utile_width(cpp); > + uint32_t xfactor = 64 / utile_w; > uint32_t utile_h = vc4_utile_height(cpp); > uint32_t xstart = box->x; > uint32_t ystart = box->y; > @@ -203,7 +290,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t > dst_stride, > for (uint32_t y = 0; y < box->height; y += utile_h) { > for (int x = 0; x < box->width; x += utile_w) { > vc4_store_utile(dst + ((ystart + y) * dst_stride + > - (xstart + x) * 64 / utile_w), > + (xstart + x) * xfactor), > src + (src_stride * y + > x * cpp), > src_stride, cpp); > -- > 2.7.4 Unrelated changes should be a separate commit. (I would expect that this change doesn't do anything, because the compiler moves the math out of the loop anyway).
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev