Callgrind CPU usage results from PTS benchmarks, for ytile_copy_faster():
Nexuiz 1.6.1: 2.48% -> 0.97% V3: - rather than putting the ssse3 code in a different file in order to compile it, make use of the gcc pragma for per-function optimisations. Results in improved performance and less impact on those not needing runtime ssse3 checks. V2: - put back the if statements and add one for the SSSE3 rgba8_copy - move some header files out of the header - don't indent the preprocessor tests - changed copyright to Google and add author Frank Henigman Signed-off-by: Timothy Arceri <t_arc...@yahoo.com.au> --- src/mesa/drivers/dri/i965/intel_tex_subimage.c | 88 +++++++++++++++++++++----- 1 file changed, 73 insertions(+), 15 deletions(-) diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c index cb5738a..c6eda5c 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c @@ -42,8 +42,13 @@ #include "intel_mipmap_tree.h" #include "intel_blit.h" -#ifdef __SSSE3__ +#include "x86/common_x86_asm.h" +#include "x86/x86_function_opt.h" + +#if defined(SSSE3_FUNC_OPT_START) +SSSE3_FUNC_OPT_START #include <tmmintrin.h> +SSSE3_FUNC_OPT_END #endif #define FILE_DEBUG_FLAG DEBUG_TEXTURE @@ -175,7 +180,8 @@ err: return false; } -#ifdef __SSSE3__ +#if defined(SSSE3_FUNC_OPT_START) +SSSE3_FUNC_OPT_START static const uint8_t rgba8_permutation[16] = { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 }; @@ -185,24 +191,18 @@ static const uint8_t rgba8_permutation[16] = (__m128i) _mm_loadu_ps((float *)(src)), \ *(__m128i *) rgba8_permutation \ ) -#endif -/** - * Copy RGBA to BGRA - swap R and B. +/* Fast copying for tile spans. + * + * As long as the destination texture is 16 aligned, + * any 16 or 64 spans we get here should also be 16 aligned. */ static inline void * -rgba8_copy(void *dst, const void *src, size_t bytes) +ssse3_fast_rgba8_copy(void *dst, const void *src, size_t bytes) { uint8_t *d = dst; uint8_t const *s = src; -#ifdef __SSSE3__ - /* Fast copying for tile spans. 
- * - * As long as the destination texture is 16 aligned, - * any 16 or 64 spans we get here should also be 16 aligned. - */ - if (bytes == 16) { assert(!(((uintptr_t)dst) & 0xf)); rgba8_copy_16(d+ 0, s+ 0); @@ -217,8 +217,30 @@ rgba8_copy(void *dst, const void *src, size_t bytes) rgba8_copy_16(d+48, s+48); return dst; } + + while (bytes >= 4) { + d[0] = s[2]; + d[1] = s[1]; + d[2] = s[0]; + d[3] = s[3]; + d += 4; + s += 4; + bytes -= 4; + } + return dst; +} +SSSE3_FUNC_OPT_END #endif +/** + * Copy RGBA to BGRA - swap R and B. + */ +static inline void * +rgba8_copy(void *dst, const void *src, size_t bytes) +{ + uint8_t *d = dst; + uint8_t const *s = src; + while (bytes >= 4) { d[0] = s[2]; d[1] = s[1]; @@ -355,6 +377,12 @@ xtile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (mem_copy == memcpy) return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height, dst, src, src_pitch, swizzle_bit, memcpy); + #if defined(SSSE3_FUNC_OPT_START) + else if (mem_copy == ssse3_fast_rgba8_copy) + return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height, + dst, src, src_pitch, swizzle_bit, + ssse3_fast_rgba8_copy); + #endif else if (mem_copy == rgba8_copy) return xtile_copy(0, 0, xtile_width, xtile_width, 0, xtile_height, dst, src, src_pitch, swizzle_bit, rgba8_copy); @@ -362,6 +390,12 @@ xtile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (mem_copy == memcpy) return xtile_copy(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, memcpy); + #if defined(SSSE3_FUNC_OPT_START) + else if (mem_copy == ssse3_fast_rgba8_copy) + return xtile_copy(x0, x1, x2, x3, y0, y1, + dst, src, src_pitch, swizzle_bit, + ssse3_fast_rgba8_copy); + #endif else if (mem_copy == rgba8_copy) return xtile_copy(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, rgba8_copy); @@ -391,6 +425,12 @@ ytile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (mem_copy == memcpy) return ytile_copy(0, 0, ytile_width, ytile_width, 0, 
ytile_height, dst, src, src_pitch, swizzle_bit, memcpy); + #if defined(SSSE3_FUNC_OPT_START) + else if (mem_copy == ssse3_fast_rgba8_copy) + return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height, + dst, src, src_pitch, swizzle_bit, + ssse3_fast_rgba8_copy); + #endif else if (mem_copy == rgba8_copy) return ytile_copy(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, src_pitch, swizzle_bit, rgba8_copy); @@ -398,6 +438,12 @@ ytile_copy_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (mem_copy == memcpy) return ytile_copy(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, memcpy); + #if defined(SSSE3_FUNC_OPT_START) + else if (mem_copy == ssse3_fast_rgba8_copy) + return ytile_copy(x0, x1, x2, x3, y0, y1, + dst, src, src_pitch, swizzle_bit, + ssse3_fast_rgba8_copy); + #endif else if (mem_copy == rgba8_copy) return ytile_copy(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, rgba8_copy); @@ -582,7 +628,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx, if (format == GL_BGRA) { mem_copy = memcpy; } else if (format == GL_RGBA) { - mem_copy = rgba8_copy; + #if defined(SSSE3_FUNC_OPT_START) + if (cpu_has_ssse3) { + mem_copy = ssse3_fast_rgba8_copy; + } + else + #endif + mem_copy = rgba8_copy; } } else if ((texImage->TexFormat == MESA_FORMAT_R8G8B8A8_UNORM) || (texImage->TexFormat == MESA_FORMAT_R8G8B8X8_UNORM)) { @@ -591,7 +643,13 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx, /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can * use the same function. */ - mem_copy = rgba8_copy; + #if defined(SSSE3_FUNC_OPT_START) + if (cpu_has_ssse3) { + mem_copy = ssse3_fast_rgba8_copy; + } + else + #endif + mem_copy = rgba8_copy; } else if (format == GL_RGBA) { mem_copy = memcpy; } -- 1.9.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev