On date Tuesday 2015-06-16 10:20:31 +0200, wm4 encoded: > On Mon, 15 Jun 2015 17:55:35 +0200 > Stefano Sabatini <stefa...@gmail.com> wrote: > > > On date Monday 2015-06-15 11:56:13 +0200, Stefano Sabatini encoded: > > [...] > > > From 3a75ef1e86360cd6f30b8e550307404d0d1c1dba Mon Sep 17 00:00:00 2001 > > > From: Stefano Sabatini <stefa...@gmail.com> > > > Date: Mon, 15 Jun 2015 11:02:50 +0200 > > > Subject: [PATCH] lavu/mem: add av_memcpynt() function with x86 > > > optimizations > > > > > > Assembly based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar > > > <fen...@videolan.org>. > > > > > > TODO: bump minor, update APIchanges > > > --- > > > libavutil/mem.c | 9 +++++ > > > libavutil/mem.h | 14 ++++++++ > > > libavutil/mem_internal.h | 26 +++++++++++++++ > > > libavutil/x86/Makefile | 1 + > > > libavutil/x86/mem.c | 85 > > > ++++++++++++++++++++++++++++++++++++++++++++++++ > > > 5 files changed, 135 insertions(+) > > > create mode 100644 libavutil/mem_internal.h > > > create mode 100644 libavutil/x86/mem.c > > > > > > diff --git a/libavutil/mem.c b/libavutil/mem.c > > > index da291fb..0e1eb01 100644 > > > --- a/libavutil/mem.c > > > +++ b/libavutil/mem.c > > > @@ -42,6 +42,7 @@ > > > #include "dynarray.h" > > > #include "intreadwrite.h" > > > #include "mem.h" > > > +#include "mem_internal.h" > > > > > > #ifdef MALLOC_PREFIX > > > > > > @@ -515,3 +516,11 @@ void av_fast_malloc(void *ptr, unsigned int *size, > > > size_t min_size) > > > ff_fast_malloc(ptr, size, min_size, 0); > > > } > > > > > > +void av_memcpynt(void *dst, const void *src, size_t size, int cpu_flags) > > > +{ > > > +#if ARCH_X86 > > > + ff_memcpynt_x86(dst, src, size, cpu_flags); > > > +#else > > > + memcpy(dst, src, size, cpu_flags); > > > +#endif > > > +} > > > > Alternatively, what about something like: > > > > av_memcpynt_fn av_memcpynt_get_fn(void); > > > > modeled after av_pixelutils_get_sad_fn()? This would skip the need for > > a wrapper calling the right function. >
> I don't see much value in this, unless determining the right function > causes too much overhead. I see two advantages: 1. no branch and no extra function call when the function is called, 2. the cpu_flags need not be passed around, so it's somewhat safer. I have no strong preference though; an updated (untested) patch is attached. -- FFmpeg = Fierce and Forgiving Merciless Powered Extroverse Gargoyle
>From c005ff5405dd48e6b0fed24ed94947f69bfe2783 Mon Sep 17 00:00:00 2001 From: Stefano Sabatini <stefa...@gmail.com> Date: Mon, 15 Jun 2015 11:02:50 +0200 Subject: [PATCH] lavu/mem: add av_memcpynt_get_fn() Assembly based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar <fen...@videolan.org>. TODO: remove use of inline assembly, bump minor, update APIchanges --- libavutil/mem.c | 9 +++++ libavutil/mem.h | 13 +++++++ libavutil/mem_internal.h | 26 +++++++++++++ libavutil/x86/Makefile | 1 + libavutil/x86/mem.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) create mode 100644 libavutil/mem_internal.h create mode 100644 libavutil/x86/mem.c diff --git a/libavutil/mem.c b/libavutil/mem.c index da291fb..325bfc9 100644 --- a/libavutil/mem.c +++ b/libavutil/mem.c @@ -42,6 +42,7 @@ #include "dynarray.h" #include "intreadwrite.h" #include "mem.h" +#include "mem_internal.h" #ifdef MALLOC_PREFIX @@ -515,3 +516,11 @@ void av_fast_malloc(void *ptr, unsigned int *size, size_t min_size) ff_fast_malloc(ptr, size, min_size, 0); } +av_memcpynt_fn av_memcpynt_get_fn(void) +{ +#if ARCH_X86 + return ff_memcpynt_get_fn_x86(); +#else + return memcpy; +#endif +} diff --git a/libavutil/mem.h b/libavutil/mem.h index 2a1e36d..d9f1b7a 100644 --- a/libavutil/mem.h +++ b/libavutil/mem.h @@ -382,6 +382,19 @@ void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size); */ void av_fast_malloc(void *ptr, unsigned int *size, size_t min_size); +typedef void* (*av_memcpynt_fn)(void *dst, const void *src, size_t size); + +/** + * Return possibly optimized function to copy size bytes from src + * to dst, using non-temporal copy. + * + * The returned function works as memcpy, but adopts non-temporal + * instructions when available. This can lead to better performance + * when transferring data from source to destination is expensive, for + * example when reading from GPU memory. 
+ */ +av_memcpynt_fn av_memcpynt_get_fn(void); + /** * @} */ diff --git a/libavutil/mem_internal.h b/libavutil/mem_internal.h new file mode 100644 index 0000000..de61cba --- /dev/null +++ b/libavutil/mem_internal.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_MEM_INTERNAL_H +#define AVUTIL_MEM_INTERNAL_H + +#include "mem.h" + +av_memcpynt_fn ff_memcpynt_get_fn_x86(void); + +#endif /* AVUTIL_MEM_INTERNAL_H */ diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile index a719c00..171c351 100644 --- a/libavutil/x86/Makefile +++ b/libavutil/x86/Makefile @@ -2,6 +2,7 @@ OBJS += x86/cpu.o \ x86/float_dsp_init.o \ x86/imgutils.o \ x86/lls_init.o \ + x86/mem.o \ OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ diff --git a/libavutil/x86/mem.c b/libavutil/x86/mem.c new file mode 100644 index 0000000..6326c90 --- /dev/null +++ b/libavutil/x86/mem.c @@ -0,0 +1,98 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include "config.h" +#include "libavutil/avassert.h" +#include "libavutil/mem_internal.h" + +#if HAVE_SSE2 +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction + * load and storing data with the SSE>=2 instruction store. + */ +#define COPY16(dstp, srcp, load, store) \ + __asm__ volatile ( \ + load " 0(%[src]), %%xmm1\n" \ + store " %%xmm1, 0(%[dst])\n" \ + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1") + +#define COPY64(dstp, srcp, load, store) \ + __asm__ volatile ( \ + load " 0(%[src]), %%xmm1\n" \ + load " 16(%[src]), %%xmm2\n" \ + load " 32(%[src]), %%xmm3\n" \ + load " 48(%[src]), %%xmm4\n" \ + store " %%xmm1, 0(%[dst])\n" \ + store " %%xmm2, 16(%[dst])\n" \ + store " %%xmm3, 32(%[dst])\n" \ + store " %%xmm4, 48(%[dst])\n" \ + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4") +#endif + +#define COPY_LINE(dstp, srcp, size, load) \ + const unsigned unaligned = (-(uintptr_t)srcp) & 0x0f; \ + unsigned x = unaligned; \ + \ + av_assert0(((intptr_t)dstp & 0x0f) == 0); \ + \ + __asm__ volatile ("mfence"); \ + if (!unaligned) { \ + for (; x+63 < size; x += 64) \ + COPY64(&dstp[x], &srcp[x], load, "movdqa"); \ + } else { \ + COPY16(dst, src, "movdqu", "movdqa"); \ + for (; x+63 < size; x += 64) \ + COPY64(&dstp[x], &srcp[x], load, "movdqu"); \ + } \ + \ + for (; x < size; x++) \ + dstp[x] = srcp[x]; \ + __asm__ volatile ("mfence"); + +static void* ff_memcpynt_sse4(void *dst, const void *src, size_t size) +{ + 
uint8_t *dstu = dst; + const uint8_t *srcu = src; + + COPY_LINE(dstu, srcu, size, "movntdqa"); + return dst; +} + +static void* ff_memcpynt_sse2(void *dst, const void *src, size_t size) +{ + uint8_t *dstu = dst; + const uint8_t *srcu = src; + + COPY_LINE(dstu, srcu, size, "movntdqa"); + return dst; +} + +av_memcpynt_fn ff_memcpynt_get_fn_x86() +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_SSE4 + if (cpu_flags & AV_CPU_FLAG_SSE4) + return ff_memcpynt_sse4; +#endif +#if HAVE_SSE2 + if (cpu_flags & AV_CPU_FLAG_SSE2) + return ff_memcpynt_sse2; +#endif + return memcpy; +} -- 1.9.1
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel