Shouldn't this stuff be in src/util? Marek
On Wed, Jul 8, 2015 at 11:07 PM, Ben Widawsky <benjamin.widaw...@intel.com> wrote: > (WARNING: No perf data, please keep reading though) > > This implements the suggestion provided by the paper, "Fast USWC to WB Memory > Copy" > (https://software.intel.com/en-us/articles/copying-accelerated-video-decode-frame-buffers). > This is described throughout the paper, but the sample code lives in Figure > 3-3. > That paper purports a roughly 40% performance gain in Mbyte/second over the > original implementation done by Matt. > > Section 3.1.2 is the summary of why an intermediate cache buffer is used. It > claims that if you use the naive implementation, fill buffers are contended > for. > To be honest, I can't quite fathom the underlying explanation, but I'll think > about it some more. Most important would be to get the perf data... This > patch > does need performance data. I don't currently have a platform that this would > benefit (BYT or BSW), so I can't get anything useful. As soon as I get a > platform to test it on, I will - meanwhile, maybe whoever tested the original > patch the first time around could run this through? > > Cc: Matt Turner <matts...@gmail.com> > Cc: Chad Versace <chad.vers...@linux.intel.com> > Cc: Kristian Høgsberg <k...@bitplanet.net> > Signed-off-by: Ben Widawsky <b...@bwidawsk.net> > --- > src/mesa/main/streaming-load-memcpy.c | 61 > +++++++++++++++++++++++++++-------- > 1 file changed, 47 insertions(+), 14 deletions(-) > > diff --git a/src/mesa/main/streaming-load-memcpy.c > b/src/mesa/main/streaming-load-memcpy.c > index d7147af..3cd310a 100644 > --- a/src/mesa/main/streaming-load-memcpy.c > +++ b/src/mesa/main/streaming-load-memcpy.c > @@ -30,6 +30,8 @@ > #include "main/streaming-load-memcpy.h" > #include <smmintrin.h> > > +static uint8_t rsvd_space[4096]; > + > /* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming > * read performance from uncached memory. 
> */ > @@ -59,23 +61,54 @@ _mesa_streaming_load_memcpy(void *restrict dst, void > *restrict src, size_t len) > len -= MIN2(bytes_before_alignment_boundary, len); > } > > - while (len >= 64) { > - __m128i *dst_cacheline = (__m128i *)d; > - __m128i *src_cacheline = (__m128i *)s; > + while (len > 64) { > + __m128i *cached_buffer = (__m128i *)rsvd_space; > + size_t streaming_len = len > 4096 ? 4096 : len; > + > + __asm__ volatile("mfence" ::: "memory"); > + > + while (streaming_len >= 64) { > + __m128i *src_cacheline = (__m128i *)s; > + > + __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); > + __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); > + __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); > + __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3); > + > + _mm_store_si128(cached_buffer + 0, temp1); > + _mm_store_si128(cached_buffer + 1, temp2); > + _mm_store_si128(cached_buffer + 2, temp3); > + _mm_store_si128(cached_buffer + 3, temp4); > + > + s += 64; > + streaming_len -= 64; > + cached_buffer += 4; > + } > + > + cached_buffer = (__m128i *)rsvd_space; > + streaming_len = len > 4096 ? 
4096 : len; > + > + __asm__ volatile("mfence" ::: "memory"); > + > + while (streaming_len >= 64) { > + __m128i *dst_cacheline = (__m128i *)d; > + > + __m128i temp1 = _mm_stream_load_si128(cached_buffer + 0); > + __m128i temp2 = _mm_stream_load_si128(cached_buffer + 1); > + __m128i temp3 = _mm_stream_load_si128(cached_buffer + 2); > + __m128i temp4 = _mm_stream_load_si128(cached_buffer + 3); > > - __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); > - __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); > - __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); > - __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3); > + _mm_store_si128(dst_cacheline + 0, temp1); > + _mm_store_si128(dst_cacheline + 1, temp2); > + _mm_store_si128(dst_cacheline + 2, temp3); > + _mm_store_si128(dst_cacheline + 3, temp4); > > - _mm_store_si128(dst_cacheline + 0, temp1); > - _mm_store_si128(dst_cacheline + 1, temp2); > - _mm_store_si128(dst_cacheline + 2, temp3); > - _mm_store_si128(dst_cacheline + 3, temp4); > + d += 64; > + streaming_len -= 64; > + cached_buffer += 4; > > - d += 64; > - s += 64; > - len -= 64; > + len -= 64; > + } > } > > /* memcpy() the tail. */ > -- > 2.4.5 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev