On date Friday 2015-05-29 09:47:58 -0700, Timothy Gu encoded: > On Fri, May 29, 2015 at 03:49:22PM +0200, Stefano Sabatini wrote: [...] > > OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ > > diff --git a/libavutil/x86/imgutils.c b/libavutil/x86/imgutils.c > > new file mode 100644 > > index 0000000..8b3ed0f > > --- /dev/null > > +++ b/libavutil/x86/imgutils.c > > @@ -0,0 +1,95 @@ > > +/* > > + * This file is part of FFmpeg. > > + * > > + * FFmpeg is free software; you can redistribute it and/or > > + * modify it under the terms of the GNU Lesser General Public > > + * License as published by the Free Software Foundation; either > > + * version 2.1 of the License, or (at your option) any later version. > > + * > > + * FFmpeg is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + * Lesser General Public License for more details. > > + * > > + * You should have received a copy of the GNU Lesser General Public > > + * License along with FFmpeg; if not, write to the Free Software > > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > > 02110-1301 USA > > + */ > > + > > +#include <inttypes.h> > > +#include "config.h" > > +#include "libavutil/avassert.h" > > +#include "libavutil/imgutils.h" > > +#include "libavutil/imgutils_internal.h" > > + > > +#if HAVE_SSE2 > > +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 > > instruction > > + * load and storing data with the SSE>=2 instruction store. 
> > + */ > > +#define COPY16(dstp, srcp, load, store) \ > > + __asm__ volatile ( \ > > + load " 0(%[src]), %%xmm1\n" \ > > + store " %%xmm1, 0(%[dst])\n" \ > > + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1") > > + > > +#define COPY64(dstp, srcp, load, store) \ > > + __asm__ volatile ( \ > > + load " 0(%[src]), %%xmm1\n" \ > > + load " 16(%[src]), %%xmm2\n" \ > > + load " 32(%[src]), %%xmm3\n" \ > > + load " 48(%[src]), %%xmm4\n" \ > > + store " %%xmm1, 0(%[dst])\n" \ > > + store " %%xmm2, 16(%[dst])\n" \ > > + store " %%xmm3, 32(%[dst])\n" \ > > + store " %%xmm4, 48(%[dst])\n" \ > > + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", > > "xmm3", "xmm4") > > +#endif > > + > > +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize, > > + const uint8_t *src, size_t src_linesize, > > + unsigned bytewidth, unsigned height, > > + int cpu_flags) > > +{ > > +#if !HAVE_SSSE3 >
> Are any SSSE3 instructions used? No. I re-checked, MOVDQA/MOVDQU were introduced in SSE2, MOVNTDQA in SSE4. > > + return av_image_copy_plane(dst, dst_linesize, src, src_linesize, > > bytewidth, height); > > +#endif > > + > > + av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0); > > + > > + __asm__ volatile ("mfence"); > > + > > + for (unsigned y = 0; y < height; y++) { > > + const unsigned unaligned = (-(uintptr_t)src) & 0x0f; > > + unsigned x = unaligned; > > + > > > +#if HAVE_SSE42 > > + if (cpu_flags & AV_CPU_FLAG_SSE4) { > > movntdqa is an SSE4.1 instruction, so this should work better: > > if (INLINE_SSE4(cpu_flags)) > > That checks both HAVE_SSE4_INLINE and cpu_flags for AV_CPU_FLAG_SSE4. > > (But then like others have said new inline asm code shouldn't be added in the > first place) Next step would be the use of YASM, but I only want to test if the general approach is fine (and if the API is not too specific). Also if someone wants to step up and port it to YASM I'm all for it, since ASM/YASM is far from being my area of expertise. -- FFmpeg = Fiendish Fabulous Most Pure Evangelical God
From ec96aee1930247248a5e438171c120ea3f5dbbea Mon Sep 17 00:00:00 2001 From: Stefano Sabatini <stefa...@gmail.com> Date: Fri, 15 May 2015 18:58:17 +0200 Subject: [PATCH] lavu/imgutils: add av_image_copy_plane_from_uswc() function. This function adds support for optimized GPU-to-CPU plane copies. Based on code from vlc dxva2.c, commit 62107e56 by Laurent Aimar <fen...@videolan.org>. TODO: fix integration with the build system, update APIchanges and bump minor once ready --- libavutil/imgutils.c | 13 +++++ libavutil/imgutils.h | 18 ++++++ libavutil/imgutils_internal.h | 29 ++++++++++ libavutil/x86/Makefile | 1 + libavutil/x86/imgutils.c | 126 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 187 insertions(+) create mode 100644 libavutil/imgutils_internal.h create mode 100644 libavutil/x86/imgutils.c diff --git a/libavutil/imgutils.c b/libavutil/imgutils.c index ef0e671..59a0054 100644 --- a/libavutil/imgutils.c +++ b/libavutil/imgutils.c @@ -30,6 +30,7 @@ #include "mathematics.h" #include "pixdesc.h" #include "rational.h" +#include "imgutils_internal.h" void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4], const AVPixFmtDescriptor *pixdesc) @@ -405,3 +406,15 @@ int av_image_copy_to_buffer(uint8_t *dst, int dst_size, return size; } + +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize, + const uint8_t *src, size_t src_linesize, + unsigned bytewidth, unsigned height, + int cpu_flags) +{ +#if ARCH_X86 + ff_image_copy_plane_from_uswc_x86(dst, dst_linesize, src, src_linesize, bytewidth, height, cpu_flags); +#else + av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height); +#endif +} diff --git a/libavutil/imgutils.h b/libavutil/imgutils.h index 23282a3..184e1e7 100644 --- a/libavutil/imgutils.h +++ b/libavutil/imgutils.h @@ -111,6 +111,24 @@ void av_image_copy_plane(uint8_t *dst, int dst_linesize, int bytewidth, int height); /** + * Copy image plane from src to dst, similar to av_image_copy_plane(). 
+ * src must be a USWC buffer. + * It performs an optimized copy from "Uncacheable Speculative Write + * Combining" memory as used by some video surfaces. + * It is really efficient only when SSE4.1 is available. + * + * In case the target CPU does not support USWC caching this function + * will be equivalent to av_image_copy_plane(). + * + * @param cpu_flags as returned by av_get_cpu_flags() + * @see av_image_copy_plane() + */ +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize, + const uint8_t *src, size_t src_linesize, + unsigned bytewidth, unsigned height, + int cpu_flags); + +/** * Copy image in src_data to dst_data. * * @param dst_linesizes linesizes for the image in dst_data diff --git a/libavutil/imgutils_internal.h b/libavutil/imgutils_internal.h new file mode 100644 index 0000000..9576afe --- /dev/null +++ b/libavutil/imgutils_internal.h @@ -0,0 +1,29 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_IMGUTILS_INTERNAL_H +#define AVUTIL_IMGUTILS_INTERNAL_H + +#include "imgutils.h" + +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize, + const uint8_t *src, size_t src_linesize, + unsigned bytewidth, unsigned height, + int cpu_flags); + +#endif /* AVUTIL_IMGUTILS_INTERNAL_H */ diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile index eb70a62..a719c00 100644 --- a/libavutil/x86/Makefile +++ b/libavutil/x86/Makefile @@ -1,5 +1,6 @@ OBJS += x86/cpu.o \ x86/float_dsp_init.o \ + x86/imgutils.o \ x86/lls_init.o \ OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ diff --git a/libavutil/x86/imgutils.c b/libavutil/x86/imgutils.c new file mode 100644 index 0000000..1750f97 --- /dev/null +++ b/libavutil/x86/imgutils.c @@ -0,0 +1,126 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include "config.h" +#include "libavutil/avassert.h" +#include "libavutil/imgutils.h" +#include "libavutil/imgutils_internal.h" + +#if HAVE_SSE2 +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 instruction + * load and storing data with the SSE>=2 instruction store. + */ +#define COPY16(dstp, srcp, load, store) \ + __asm__ volatile ( \ + load " 0(%[src]), %%xmm1\n" \ + store " %%xmm1, 0(%[dst])\n" \ + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1") + +#define COPY64(dstp, srcp, load, store) \ + __asm__ volatile ( \ + load " 0(%[src]), %%xmm1\n" \ + load " 16(%[src]), %%xmm2\n" \ + load " 32(%[src]), %%xmm3\n" \ + load " 48(%[src]), %%xmm4\n" \ + store " %%xmm1, 0(%[dst])\n" \ + store " %%xmm2, 16(%[dst])\n" \ + store " %%xmm3, 32(%[dst])\n" \ + store " %%xmm4, 48(%[dst])\n" \ + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4") +#endif + +#if HAVE_SSE2 +static void ff_image_copy_plane_from_uswc_sse2(uint8_t *dst, size_t dst_linesize, + const uint8_t *src, size_t src_linesize, + unsigned bytewidth, unsigned height) +{ + __asm__ volatile ("mfence"); + + av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0); + + for (unsigned y = 0; y < height; y++) { + const unsigned unaligned = (-(uintptr_t)src) & 0x0f; + unsigned x = unaligned; + + if (!unaligned) { + for (; x+63 < bytewidth; x += 64) + COPY64(&dst[x], &src[x], "movdqa", "movdqa"); + } else { + COPY16(dst, src, "movdqu", "movdqa"); + for (; x+63 < bytewidth; x += 64) + COPY64(&dst[x], &src[x], "movdqa", "movdqu"); + } + + for (; x < bytewidth; x++) + dst[x] = src[x]; + + src += src_linesize; + dst += dst_linesize; + } + __asm__ volatile ("mfence"); +} +#endif + +#if HAVE_SSE4 +static void 
ff_image_copy_plane_from_uswc_sse4(uint8_t *dst, size_t dst_linesize, + const uint8_t *src, size_t src_linesize, + unsigned bytewidth, unsigned height) +{ + __asm__ volatile ("mfence"); + + av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0); + + for (unsigned y = 0; y < height; y++) { + const unsigned unaligned = (-(uintptr_t)src) & 0x0f; + unsigned x = unaligned; + + if (!unaligned) { + for (; x+63 < bytewidth; x += 64) + COPY64(&dst[x], &src[x], "movntdqa", "movdqa"); + } else { + COPY16(dst, src, "movdqu", "movdqa"); + for (; x+63 < bytewidth; x += 64) + COPY64(&dst[x], &src[x], "movntdqa", "movdqu"); + } + + for (; x < bytewidth; x++) + dst[x] = src[x]; + + src += src_linesize; + dst += dst_linesize; + } + __asm__ volatile ("mfence"); +} +#endif + +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize, + const uint8_t *src, size_t src_linesize, + unsigned bytewidth, unsigned height, + int cpu_flags) +{ +#if HAVE_SSE4 + if (cpu_flags & AV_CPU_FLAG_SSE4) + return ff_image_copy_plane_from_uswc_sse4(dst, dst_linesize, src, src_linesize, bytewidth, height); +#endif +#if HAVE_SSE2 + if (cpu_flags & AV_CPU_FLAG_SSE2) + return ff_image_copy_plane_from_uswc_sse2(dst, dst_linesize, src, src_linesize, bytewidth, height); +#endif + return av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, height); +} -- 1.9.1
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel