(WARNING: No perf data, please keep reading though.) This implements the suggestion provided by the paper, "Fast USWC to WB Memory Copy" (https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers). The technique is described throughout the paper, but the sample code lives in Figure 3-3. The paper claims a roughly 40% performance gain in Mbyte/second over the original implementation done by Matt.
Section 3.1.2 summarizes why an intermediate cache buffer is used. It claims that if you use the naive implementation, there is contention for the fill buffers. To be honest, I can't quite fathom the underlying explanation, but I'll think about it some more. Most important would be to get the perf data... This patch does need performance data. I don't currently have a platform that this would benefit (BYT or BSW), so I can't get anything useful. As soon as I get a platform to test it on, I will - meanwhile, maybe whoever tested the original patch the first time around could run this through? Cc: Matt Turner <matts...@gmail.com> Cc: Chad Versace <chad.vers...@linux.intel.com> Cc: Kristian Høgsberg <k...@bitplanet.net> Signed-off-by: Ben Widawsky <b...@bwidawsk.net> --- src/mesa/main/streaming-load-memcpy.c | 61 +++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/src/mesa/main/streaming-load-memcpy.c b/src/mesa/main/streaming-load-memcpy.c index d7147af..3cd310a 100644 --- a/src/mesa/main/streaming-load-memcpy.c +++ b/src/mesa/main/streaming-load-memcpy.c @@ -30,6 +30,8 @@ #include "main/streaming-load-memcpy.h" #include <smmintrin.h> +static uint8_t rsvd_space[4096]; + /* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming * read performance from uncached memory. */ @@ -59,23 +61,54 @@ _mesa_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len) len -= MIN2(bytes_before_alignment_boundary, len); } - while (len >= 64) { - __m128i *dst_cacheline = (__m128i *)d; - __m128i *src_cacheline = (__m128i *)s; + while (len > 64) { + __m128i *cached_buffer = (__m128i *)rsvd_space; + size_t streaming_len = len > 4096 ? 
4096 : len; + + __asm__ volatile("mfence" ::: "memory"); + + while (streaming_len >= 64) { + __m128i *src_cacheline = (__m128i *)s; + + __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); + __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); + __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); + __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3); + + _mm_store_si128(cached_buffer + 0, temp1); + _mm_store_si128(cached_buffer + 1, temp2); + _mm_store_si128(cached_buffer + 2, temp3); + _mm_store_si128(cached_buffer + 3, temp4); + + s += 64; + streaming_len -= 64; + cached_buffer += 4; + } + + cached_buffer = (__m128i *)rsvd_space; + streaming_len = len > 4096 ? 4096 : len; + + __asm__ volatile("mfence" ::: "memory"); + + while (streaming_len >= 64) { + __m128i *dst_cacheline = (__m128i *)d; + + __m128i temp1 = _mm_stream_load_si128(cached_buffer + 0); + __m128i temp2 = _mm_stream_load_si128(cached_buffer + 1); + __m128i temp3 = _mm_stream_load_si128(cached_buffer + 2); + __m128i temp4 = _mm_stream_load_si128(cached_buffer + 3); - __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); - __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); - __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); - __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3); + _mm_store_si128(dst_cacheline + 0, temp1); + _mm_store_si128(dst_cacheline + 1, temp2); + _mm_store_si128(dst_cacheline + 2, temp3); + _mm_store_si128(dst_cacheline + 3, temp4); - _mm_store_si128(dst_cacheline + 0, temp1); - _mm_store_si128(dst_cacheline + 1, temp2); - _mm_store_si128(dst_cacheline + 2, temp3); - _mm_store_si128(dst_cacheline + 3, temp4); + d += 64; + streaming_len -= 64; + cached_buffer += 4; - d += 64; - s += 64; - len -= 64; + len -= 64; + } } /* memcpy() the tail. */ -- 2.4.5 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev