Am 24.10.2014 um 23:06 schrieb Ian Romanick:
> On 10/24/2014 05:47 AM, Timothy Arceri wrote:
>> Makes use of SSE to speed up compute of min and max elements
>>
>> Callgrind cpu usage results from pts benchmarks:
>>
>> Openarena 0.8.8: 3.67% -> 1.03%
>> UrbanTerror: 2.36% -> 0.81%
>>
>> Signed-off-by: Timothy Arceri <t_arc...@yahoo.com.au>
>> ---
>>  src/mesa/Android.libmesa_dricore.mk |  3 +-
>>  src/mesa/Makefile.am                |  3 +-
>>  src/mesa/Makefile.sources           |  1 +
>>  src/mesa/main/sse_minmax.c          | 81 +++++++++++++++++++++++++++++++++++++
>>  src/mesa/main/sse_minmax.h          | 30 ++++++++++++++
>>  src/mesa/vbo/vbo_exec_array.c       | 13 ++++--
>>  6 files changed, 126 insertions(+), 5 deletions(-)
>>  create mode 100644 src/mesa/main/sse_minmax.c
>>  create mode 100644 src/mesa/main/sse_minmax.h
>>
>> This version includes all the suggestions from Brian and Matt, thanks for
>> the review guys.
>>
>> I haven't been able to try Matt's suggestion and compare this to what OpenMP
>> would generate, as I only have one machine that supports SSE4.1 with Fedora 20
>> and I don't want to have to upgrade to Fedora 21 alpha (gcc 4.9) just to test
>> this (although I did consider it). If people are happy with this code I will
>> revisit OpenMP for Mesa 10.5 and will look at using OpenMP for the short and
>> byte support too.
>>
>> diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk
>> index 1e6d948..52d626f 100644
>> --- a/src/mesa/Android.libmesa_dricore.mk
>> +++ b/src/mesa/Android.libmesa_dricore.mk
>> @@ -51,7 +51,8 @@ endif # MESA_ENABLE_ASM
>>  
>>  ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
>>  LOCAL_SRC_FILES += \
>> -    $(SRCDIR)main/streaming-load-memcpy.c
>> +    $(SRCDIR)main/streaming-load-memcpy.c \
>> +    $(SRCDIR)main/sse_minmax.c
>>  LOCAL_CFLAGS := -msse4.1
>>  endif
>>  
>> diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
>> index e71bccb..932db4f 100644
>> --- a/src/mesa/Makefile.am
>> +++ b/src/mesa/Makefile.am
>> @@ -151,7 +151,8 @@ libmesagallium_la_LIBADD = \
>>      $(ARCH_LIBS)
>>  
>>  libmesa_sse41_la_SOURCES = \
>> -    main/streaming-load-memcpy.c
>> +    main/streaming-load-memcpy.c \
>> +    main/sse_minmax.c
>>  libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) -msse4.1
>>  
>>  pkgconfigdir = $(libdir)/pkgconfig
>> diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
>> index 4755018..dd10574 100644
>> --- a/src/mesa/Makefile.sources
>> +++ b/src/mesa/Makefile.sources
>> @@ -93,6 +93,7 @@ MAIN_FILES = \
>>      $(SRCDIR)main/shaderobj.c \
>>      $(SRCDIR)main/shader_query.cpp \
>>      $(SRCDIR)main/shared.c \
>> +    $(SRCDIR)main/sse_minmax.c \
>>      $(SRCDIR)main/state.c \
>>      $(SRCDIR)main/stencil.c \
>>      $(SRCDIR)main/syncobj.c \
>> diff --git a/src/mesa/main/sse_minmax.c b/src/mesa/main/sse_minmax.c
>> new file mode 100644
>> index 0000000..577f44e
>> --- /dev/null
>> +++ b/src/mesa/main/sse_minmax.c
>> @@ -0,0 +1,81 @@
>> +/*
>> + * Copyright © 2014 Timothy Arceri
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Author:
>> + *    Timothy Arceri <t_arc...@yahoo.com.au>
>> + *
>> + */
>> +
>> +#ifdef __SSE4_1__
>> +#include "main/glheader.h"
>> +#include "main/sse_minmax.h"
>> +#include <smmintrin.h>
>> +
>> +void
>> +_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
>> +                         unsigned *max_index, const unsigned count)
>> +{
>> +   unsigned i = 0;
>> +   unsigned max_ui = 0;
>> +   unsigned min_ui = ~0U;
>> +
>> +   if (count >= 4) {
>> +      unsigned max_arr[4] __attribute__ ((aligned (16)));
>> +      unsigned min_arr[4] __attribute__ ((aligned (16)));
>> +      unsigned vec_count;
>> +      __m128i max_ui4 = _mm_setzero_si128();
>> +      __m128i min_ui4 = _mm_set1_epi32(~0U);
>> +      __m128i ui_indices4;
>> +      __m128i *ui_indices_ptr;
>> +
>> +      vec_count = count & ~0x3;
>> +      ui_indices_ptr = (__m128i*)ui_indices;
>> +      for (i = 0; i < vec_count / 4; i++) {
>> +         ui_indices4 = _mm_loadu_si128(&ui_indices_ptr[i]);
> 
> How does this fare with unaligned data?  My recollection is that
> _mm_loadu_si128 could be quite a bit slower than _mm_load_si128.  It
> might be worth handling the first few values without SSE until the
> pointer is aligned.
> 
> Or my memory might be wrong.

IIRC, even for unaligned memory, movdqu doesn't have much of a
performance penalty on newer Core CPUs (Sandy Bridge and newer?) - only
some when actually crossing cache line boundaries, and even then not too
severe (though the optimization guides don't say how large). There are
probably lots of CPUs out there, though, where the penalty could be quite
large, so using aligned loads might be a good idea.

Roland

>> +         max_ui4 = _mm_max_epu32(ui_indices4, max_ui4);
>> +         min_ui4 = _mm_min_epu32(ui_indices4, min_ui4);
>> +      }
>> +
>> +      _mm_store_si128((__m128i*)max_arr, max_ui4);
>> +      _mm_store_si128((__m128i*)min_arr, min_ui4);
>> +
>> +      for (i = 0; i < 4; i++) {
>> +         if (max_arr[i] > max_ui)
>> +            max_ui = max_arr[i];
>> +         if (min_arr[i] < min_ui)
>> +            min_ui = min_arr[i];
>> +      }
>> +      i = vec_count;
>> +   }
>> +
>> +   for (; i < count; i++) {
>> +      if (ui_indices[i] > max_ui)
>> +         max_ui = ui_indices[i];
>> +      if (ui_indices[i] < min_ui)
>> +         min_ui = ui_indices[i];
>> +   }
>> +
>> +   *min_index = min_ui;
>> +   *max_index = max_ui;
>> +}
>> +
>> +#endif
>> diff --git a/src/mesa/main/sse_minmax.h b/src/mesa/main/sse_minmax.h
>> new file mode 100644
>> index 0000000..953c4e9
>> --- /dev/null
>> +++ b/src/mesa/main/sse_minmax.h
>> @@ -0,0 +1,30 @@
>> +/*
>> + * Copyright © 2014 Timothy Arceri
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Author:
>> + *    Timothy Arceri <t_arc...@yahoo.com.au>
>> + *
>> + */
>> +
>> +void
>> +_mesa_uint_array_min_max(const unsigned *ui_indices, unsigned *min_index,
>> +                         unsigned *max_index, const unsigned count);
>> diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
>> index 045dbb5..f857da4 100644
>> --- a/src/mesa/vbo/vbo_exec_array.c
>> +++ b/src/mesa/vbo/vbo_exec_array.c
>> @@ -36,6 +36,8 @@
>>  #include "main/enums.h"
>>  #include "main/macros.h"
>>  #include "main/transformfeedback.h"
>> +#include "main/sse_minmax.h"
>> +#include "x86/common_x86_asm.h"
>>  
>>  #include "vbo_context.h"
>>  
>> @@ -119,9 +121,14 @@ vbo_get_minmax_index(struct gl_context *ctx,
>>           }
>>        }
>>        else {
>> -         for (i = 0; i < count; i++) {
>> -            if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
>> -            if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
>> +         if (cpu_has_sse4_1) {
>> +            _mesa_uint_array_min_max(ui_indices, &min_ui, &max_ui, count);
>> +         }
>> +         else {
>> +            for (i = 0; i < count; i++) {
>> +               if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
>> +               if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
>> +            }
>>           }
>>        }
>>        *min_index = min_ui;
>>
>>
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=F4msKE2WxRzA%2BwN%2B25muztFm5TSPwE8HKJfWfR2NgfY%3D%0A&m=LwBPXKJD2HNZ%2Be%2BmFAxLkmbWJ%2B7B0CRueVIXxSuOmv0%3D%0A&s=34e091c64218e9636106b996236961e26c46480a5064ca945475272cdaf818c8
>>
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://urldefense.proofpoint.com/v1/url?u=http://lists.freedesktop.org/mailman/listinfo/mesa-dev&k=oIvRg1%2BdGAgOoM1BIlLLqw%3D%3D%0A&r=F4msKE2WxRzA%2BwN%2B25muztFm5TSPwE8HKJfWfR2NgfY%3D%0A&m=LwBPXKJD2HNZ%2Be%2BmFAxLkmbWJ%2B7B0CRueVIXxSuOmv0%3D%0A&s=34e091c64218e9636106b996236961e26c46480a5064ca945475272cdaf818c8
> 

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to