Also cleans up some if statements in the *_copy_faster() functions. Callgrind CPU usage results from pts benchmarks:
For ytile_copy_faster() Nexuiz 1.6.1: 2.16% -> 1.20%

Signed-off-by: Timothy Arceri <t_arc...@yahoo.com.au>
---
 src/mesa/Makefile.am                           |  8 +++
 src/mesa/drivers/dri/i965/intel_tex_subimage.c | 82 ++++++--------------------
 src/mesa/main/fast_rgba8_copy.c                | 78 ++++++++++++++++++++++++
 src/mesa/main/fast_rgba8_copy.h                | 37 ++++++++++++
 4 files changed, 141 insertions(+), 64 deletions(-)
 create mode 100644 src/mesa/main/fast_rgba8_copy.c
 create mode 100644 src/mesa/main/fast_rgba8_copy.h

diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index e71bccb..2402096 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -107,6 +107,10 @@ AM_CXXFLAGS = $(LLVM_CFLAGS) $(VISIBILITY_CXXFLAGS)
 
 ARCH_LIBS =
 
+if SSSE3_SUPPORTED
+ARCH_LIBS += libmesa_ssse3.la
+endif
+
 if SSE41_SUPPORTED
 ARCH_LIBS += libmesa_sse41.la
 endif
@@ -154,6 +158,10 @@ libmesa_sse41_la_SOURCES = \
 	main/streaming-load-memcpy.c
 libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
 
+libmesa_ssse3_la_SOURCES = \
+	main/fast_rgba8_copy.c
+libmesa_ssse3_la_CFLAGS = $(AM_CFLAGS) -mssse3
+
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = gl.pc
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index cb5738a..0deeb75 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -27,6 +27,7 @@
  **************************************************************************/
 
 #include "main/bufferobj.h"
+#include "main/fast_rgba8_copy.h"
 #include "main/image.h"
 #include "main/macros.h"
 #include "main/mtypes.h"
@@ -42,9 +43,7 @@
 #include "intel_mipmap_tree.h"
 #include "intel_blit.h"
 
-#ifdef __SSSE3__
-#include <tmmintrin.h>
-#endif
+#include "x86/common_x86_asm.h"
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
 
@@ -175,18 +174,6 @@ err:
    return false;
 }
 
-#ifdef __SSSE3__
-static const uint8_t rgba8_permutation[16] =
-  { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
-
-/* NOTE: dst must be 16 byte aligned */
-#define rgba8_copy_16(dst, src) \
-  *(__m128i *)(dst) = _mm_shuffle_epi8( \
-     (__m128i) _mm_loadu_ps((float *)(src)), \
-     *(__m128i *) rgba8_permutation \
-  )
-#endif
-
 /**
  * Copy RGBA to BGRA - swap R and B.
  */
@@ -196,29 +183,6 @@ rgba8_copy(void *dst, const void *src, size_t bytes)
    uint8_t *d = dst;
    uint8_t const *s = src;
 
-#ifdef __SSSE3__
-   /* Fast copying for tile spans.
-    *
-    * As long as the destination texture is 16 aligned,
-    * any 16 or 64 spans we get here should also be 16 aligned.
-    */
-
-   if (bytes == 16) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16(d+ 0, s+ 0);
-      return dst;
-   }
-
-   if (bytes == 64) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16(d+ 0, s+ 0);
-      rgba8_copy_16(d+16, s+16);
-      rgba8_copy_16(d+32, s+32);
-      rgba8_copy_16(d+48, s+48);
-      return dst;
-   }
-#endif
-
    while (bytes >= 4) {
       d[0] = s[2];
       d[1] = s[1];
@@ -352,19 +316,8 @@ xtile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                   mem_copy_fn mem_copy)
 {
    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
-      if (mem_copy == memcpy)
-         return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
-                           dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy)
-         return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
-                           dst, src, src_pitch, swizzle_bit, rgba8_copy);
-   } else {
-      if (mem_copy == memcpy)
-         return xtile_copy(x0, x1, x2, x3, y0, y1,
-                           dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy)
-         return xtile_copy(x0, x1, x2, x3, y0, y1,
-                           dst, src, src_pitch, swizzle_bit, rgba8_copy);
+      return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height,
+                        dst, src, src_pitch, swizzle_bit, mem_copy);
    }
    xtile_copy(x0, x1, x2, x3, y0, y1,
               dst, src, src_pitch, swizzle_bit, mem_copy);
@@ -388,19 +341,8 @@ ytile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                   mem_copy_fn mem_copy)
 {
    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
-      if (mem_copy == memcpy)
-         return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
-                           dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy)
-         return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
-                           dst, src, src_pitch, swizzle_bit, rgba8_copy);
-   } else {
-      if (mem_copy == memcpy)
-         return ytile_copy(x0, x1, x2, x3, y0, y1,
-                           dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy)
-         return ytile_copy(x0, x1, x2, x3, y0, y1,
-                           dst, src, src_pitch, swizzle_bit, rgba8_copy);
+      return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height,
+                        dst, src, src_pitch, swizzle_bit, mem_copy);
    }
    ytile_copy(x0, x1, x2, x3, y0, y1,
               dst, src, src_pitch, swizzle_bit, mem_copy);
@@ -582,6 +524,12 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
       if (format == GL_BGRA) {
          mem_copy = memcpy;
       } else if (format == GL_RGBA) {
+         #if defined(USE_SSSE3)
+         if (cpu_has_ssse3) {
+            mem_copy = _mesa_fast_rgba8_copy;
+         }
+         else
+         #endif
          mem_copy = rgba8_copy;
       }
    } else if ((texImage->TexFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
@@ -591,6 +539,12 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
          * use the same function.
          */
+         #if defined(USE_SSSE3)
+         if (cpu_has_ssse3) {
+            mem_copy = _mesa_fast_rgba8_copy;
+         }
+         else
+         #endif
          mem_copy = rgba8_copy;
       } else if (format == GL_RGBA) {
          mem_copy = memcpy;
diff --git a/src/mesa/main/fast_rgba8_copy.c b/src/mesa/main/fast_rgba8_copy.c
new file mode 100644
index 0000000..7ac3f3b
--- /dev/null
+++ b/src/mesa/main/fast_rgba8_copy.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2003 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifdef __SSSE3__
+#include "main/fast_rgba8_copy.h"
+#include <tmmintrin.h>
+
+static const uint8_t rgba8_permutation[16] =
+  { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
+
+/* NOTE: dst must be 16 byte aligned */
+#define rgba8_copy_16(dst, src) \
+  *(__m128i *)(dst) = _mm_shuffle_epi8( \
+     (__m128i) _mm_loadu_ps((float *)(src)), \
+     *(__m128i *) rgba8_permutation \
+  )
+
+/* Fast copying for tile spans.
+ *
+ * As long as the destination texture is 16 aligned,
+ * any 16 or 64 spans we get here should also be 16 aligned.
+ */
+void *
+_mesa_fast_rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+   uint8_t *d = dst;
+   uint8_t const *s = src;
+
+   if (bytes == 16) {
+      assert(!(((uintptr_t)dst) & 0xf));
+      rgba8_copy_16(d+ 0, s+ 0);
+      return dst;
+   }
+
+   if (bytes == 64) {
+      assert(!(((uintptr_t)dst) & 0xf));
+      rgba8_copy_16(d+ 0, s+ 0);
+      rgba8_copy_16(d+16, s+16);
+      rgba8_copy_16(d+32, s+32);
+      rgba8_copy_16(d+48, s+48);
+      return dst;
+   }
+
+   while (bytes >= 4) {
+      d[0] = s[2];
+      d[1] = s[1];
+      d[2] = s[0];
+      d[3] = s[3];
+      d += 4;
+      s += 4;
+      bytes -= 4;
+   }
+   return dst;
+}
+#endif
diff --git a/src/mesa/main/fast_rgba8_copy.h b/src/mesa/main/fast_rgba8_copy.h
new file mode 100644
index 0000000..2154daa
--- /dev/null
+++ b/src/mesa/main/fast_rgba8_copy.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2003 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+/* Fast copying for tile spans.
+ *
+ * As long as the destination texture is 16 aligned,
+ * any 16 or 64 spans we get here should also be 16 aligned.
+ */
+void *
+_mesa_fast_rgba8_copy(void *dst, const void *src, size_t n);
--
1.9.3
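
For anyone reading along, here is a small self-contained sketch (illustrative only, not part of the patch) of the pattern the change moves to: the pshufb-based RGBA<->BGRA swap is built with -mssse3, and a plain-C fallback is chosen at run time on CPUs without SSSE3. __builtin_cpu_supports("ssse3") stands in for Mesa's cpu_has_ssse3 flag, and the file and function names below are made up for the example.

/* sketch.c -- not the patch itself, just the dispatch idea in miniature. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef void *(*mem_copy_fn)(void *dst, const void *src, size_t bytes);

/* Plain C fallback: swap R and B four bytes at a time. */
static void *
rgba8_copy_scalar(void *dst, const void *src, size_t bytes)
{
   uint8_t *d = dst;
   const uint8_t *s = src;

   while (bytes >= 4) {
      d[0] = s[2];
      d[1] = s[1];
      d[2] = s[0];
      d[3] = s[3];
      d += 4;
      s += 4;
      bytes -= 4;
   }
   return dst;
}

#ifdef __SSSE3__
#include <tmmintrin.h>

/* SSSE3 path: one pshufb per 16 bytes; dst must be 16-byte aligned and
 * bytes a multiple of 16 (true for the 16/64-byte tile spans).
 */
static void *
rgba8_copy_ssse3(void *dst, const void *src, size_t bytes)
{
   static const uint8_t perm[16] =
      { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
   const __m128i shuf = _mm_loadu_si128((const __m128i *)perm);
   uint8_t *d = dst;
   const uint8_t *s = src;

   assert(!((uintptr_t)dst & 0xf) && bytes % 16 == 0);
   for (; bytes >= 16; bytes -= 16, d += 16, s += 16)
      _mm_store_si128((__m128i *)d,
                      _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)s),
                                       shuf));
   return dst;
}
#endif

/* Run-time selection, analogous to the cpu_has_ssse3 check in the patch. */
static mem_copy_fn
pick_rgba8_copy(void)
{
#ifdef __SSSE3__
   if (__builtin_cpu_supports("ssse3"))
      return rgba8_copy_ssse3;
#endif
   return rgba8_copy_scalar;
}

int
main(void)
{
   _Alignas(16) uint8_t src[16] =
      { 1,2,3,4, 5,6,7,8, 9,10,11,12, 13,14,15,16 };
   _Alignas(16) uint8_t dst[16];

   pick_rgba8_copy()(dst, src, sizeof(src));
   printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);   /* 3 2 1 4 */
   return 0;
}

Built with "gcc -std=c11 -mssse3 sketch.c", the SSSE3 path is compiled in but only selected when the CPU actually reports SSSE3; built without -mssse3, only the scalar path exists. Either way the program prints "3 2 1 4".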