Quoting Scott D Phillips (2018-04-03 21:05:42) > The reference for MOVNTDQA says: > > For WC memory type, the nontemporal hint may be implemented by > loading a temporary internal buffer with the equivalent of an > aligned cache line without filling this data to the cache. > [...] Subsequent MOVNTDQA reads to unread portions of the WC > cache line will receive data from the temporary internal > buffer if data is available. > > This hidden cache line sized temporary buffer can improve the > read performance from wc maps. > --- > src/mesa/drivers/dri/i965/Makefile.am | 7 ++++ > src/mesa/drivers/dri/i965/Makefile.sources | 6 ++- > src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 52 > ++++++++++++++++++++++++++ > src/mesa/drivers/dri/i965/meson.build | 18 +++++++-- > 4 files changed, 78 insertions(+), 5 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/Makefile.am > b/src/mesa/drivers/dri/i965/Makefile.am > index 889d4c68a2b..ff47add93f4 100644 > --- a/src/mesa/drivers/dri/i965/Makefile.am > +++ b/src/mesa/drivers/dri/i965/Makefile.am > @@ -92,8 +92,14 @@ libi965_gen11_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=110 > > noinst_LTLIBRARIES = \ > libi965_dri.la \ > + libintel_tiled_memcpy.la \ > $(I965_PERGEN_LIBS) > > +libintel_tiled_memcpy_la_SOURCES = \ > + $(intel_tiled_memcpy_FILES) > +libintel_tiled_memcpy_la_CFLAGS = \ > + $(AM_CFLAGS) $(SSE41_CFLAGS) > + > libi965_dri_la_SOURCES = \ > $(i965_FILES) \ > $(i965_oa_GENERATED_FILES) > @@ -104,6 +110,7 @@ libi965_dri_la_LIBADD = \ > $(top_builddir)/src/intel/compiler/libintel_compiler.la \ > $(top_builddir)/src/intel/blorp/libblorp.la \ > $(I965_PERGEN_LIBS) \ > + libintel_tiled_memcpy.la > $(LIBDRM_LIBS)
Makes sense. > diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > index 7c6bde990d6..d076351b322 100644 > --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > @@ -36,6 +36,10 @@ > #include "brw_context.h" > #include "intel_tiled_memcpy.h" > > +#if defined(USE_SSE41) > +#include "main/streaming-load-memcpy.h" > +#include <smmintrin.h> > +#endif > #if defined(__SSSE3__) > #include <tmmintrin.h> > #elif defined(__SSE2__) > @@ -213,6 +217,30 @@ rgba8_copy_aligned_src(void *dst, const void *src, > size_t bytes) > return dst; > } > > +#if defined(USE_SSE41) > +static ALWAYS_INLINE void* Space in that void*? (but don't quote me on mesa/i965 preferred style!) > +_memcpy_streaming_load(void *dest, const void *src, size_t count) > +{ > + if (count == 16) { > + __m128i val = _mm_stream_load_si128((__m128i *)src); > + _mm_store_si128((__m128i *)dest, val); > + return dest; > + } else if (count == 64) { > + __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0); > + __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1); > + __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2); > + __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3); > + _mm_store_si128(((__m128i *)dest) + 0, val0); > + _mm_store_si128(((__m128i *)dest) + 1, val1); > + _mm_store_si128(((__m128i *)dest) + 2, val2); > + _mm_store_si128(((__m128i *)dest) + 3, val3); > + return dest; > + } else { assert(count < 16); or assert(count < 64) ? Might as well remind the reader (and caller?!) that this is only for copying the residuals. > + return memcpy(dest, src, count); > + } > +} > +#endif > + > /** > * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), > [x2,x3). > * These ranges are in bytes, i.e. pixels * bytes-per-pixel. 
> @@ -677,6 +705,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, > uint32_t x2, uint32_t x3, > return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, > xtile_height, > dst, src, dst_pitch, swizzle_bit, > rgba8_copy, rgba8_copy_aligned_src); > +#if defined(USE_SSE41) > + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) > + return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, > xtile_height, > + dst, src, dst_pitch, swizzle_bit, memcpy, > + _memcpy_streaming_load); Please group memcpy and _memcpy_streaming_load (put the line break before to keep them on the same line). > +#endif > else > unreachable("not reached"); > } else { > @@ -687,6 +721,12 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, > uint32_t x2, uint32_t x3, > return xtiled_to_linear(x0, x1, x2, x3, y0, y1, > dst, src, dst_pitch, swizzle_bit, > rgba8_copy, rgba8_copy_aligned_src); > +#if defined(USE_SSE41) > + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) > + return xtiled_to_linear(x0, x1, x2, x3, y0, y1, > + dst, src, dst_pitch, swizzle_bit, memcpy, > + _memcpy_streaming_load); > +#endif > else > unreachable("not reached"); > } > @@ -719,6 +759,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, > uint32_t x2, uint32_t x3, > return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, > ytile_height, > dst, src, dst_pitch, swizzle_bit, > rgba8_copy, rgba8_copy_aligned_src); > +#if defined(USE_SSE41) > + else if (mem_copy == (mem_copy_fn)_mesa_streaming_load_memcpy) > + return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, > ytile_height, > + dst, src, dst_pitch, swizzle_bit, > + memcpy, _memcpy_streaming_load); > +#endif > else > unreachable("not reached"); > } else { > @@ -729,6 +775,12 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, > uint32_t x2, uint32_t x3, > return ytiled_to_linear(x0, x1, x2, x3, y0, y1, > dst, src, dst_pitch, swizzle_bit, > rgba8_copy, rgba8_copy_aligned_src); > +#if defined(USE_SSE41) > + else if (mem_copy == 
(mem_copy_fn)_mesa_streaming_load_memcpy) > + return ytiled_to_linear(x0, x1, x2, x3, y0, y1, > + dst, src, dst_pitch, swizzle_bit, > + memcpy, _memcpy_streaming_load); > +#endif > else > unreachable("not reached"); > } Ok, was hoping to see how you choose to use the streaming load, but I guess that's the next patch. Reviewed-by: Chris Wilson <ch...@chris-wilson.co.uk> -Chris _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev