On Fri, May 29, 2015 at 03:49:22PM +0200, Stefano Sabatini wrote: > @@ -405,3 +406,16 @@ int av_image_copy_to_buffer(uint8_t *dst, int dst_size, > > return size; > } > + > +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + int cpu_flags) > +{ > +#if !HAVE_SSSE3
> + av_unused(cpu_flags); av_unused has a different definition than VLC_UNUSED. Just use a (void) cast. > + av_image_copy_plane(dst, dst_linesize, src, src_linesize, bytewidth, > height); > +#else > + ff_image_copy_plane_from_uswc_x86(dst, dst_linesize, src, src_linesize, > bytewidth, height, cpu_flags); > +#endif > +} > diff --git a/libavutil/imgutils.h b/libavutil/imgutils.h > index 23282a3..184e1e7 100644 > --- a/libavutil/imgutils.h > +++ b/libavutil/imgutils.h > @@ -111,6 +111,24 @@ void av_image_copy_plane(uint8_t *dst, int > dst_linesize, > int bytewidth, int height); > > /** > + * Copy image plane from src to dst, similar to av_image_copy_plane(). > + * src must be an USWC buffer. > + * It performs optimized copy from "Uncacheable Speculative Write > + * Combining" memory as used by some video surface. > + * It is really efficient only when SSE4.1 is available. > + * > + * In case the target CPU does not support USWC caching this function > + * will be equivalent to av_image_copy_plane(). > + * > + * @param cpu_flags as returned by av_get_cpu_flags() > + * @see av_image_copy_plane() > + */ > +void av_image_copy_plane_from_uswc(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + int cpu_flags); > + > +/** > * Copy image in src_data to dst_data. > * > * @param dst_linesizes linesizes for the image in dst_data > diff --git a/libavutil/imgutils_internal.h b/libavutil/imgutils_internal.h > new file mode 100644 > index 0000000..9576afe > --- /dev/null > +++ b/libavutil/imgutils_internal.h > @@ -0,0 +1,29 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. 
> + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#ifndef AVUTIL_IMGUTILS_INTERNAL_H > +#define AVUTIL_IMGUTILS_INTERNAL_H > + > +#include "imgutils.h" > + > +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + int cpu_flags); > + > +#endif /* AVUTIL_IMGUTILS_INTERNAL_H */ > diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile > index eb70a62..a719c00 100644 > --- a/libavutil/x86/Makefile > +++ b/libavutil/x86/Makefile > @@ -1,5 +1,6 @@ > OBJS += x86/cpu.o \ > x86/float_dsp_init.o \ > + x86/imgutils.o \ > x86/lls_init.o \ > > OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils_init.o \ > diff --git a/libavutil/x86/imgutils.c b/libavutil/x86/imgutils.c > new file mode 100644 > index 0000000..8b3ed0f > --- /dev/null > +++ b/libavutil/x86/imgutils.c > @@ -0,0 +1,95 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include <inttypes.h> > +#include "config.h" > +#include "libavutil/avassert.h" > +#include "libavutil/imgutils.h" > +#include "libavutil/imgutils_internal.h" > + > +#if HAVE_SSE2 > +/* Copy 16/64 bytes from srcp to dstp loading data with the SSE>=2 > instruction > + * load and storing data with the SSE>=2 instruction store. > + */ > +#define COPY16(dstp, srcp, load, store) \ > + __asm__ volatile ( \ > + load " 0(%[src]), %%xmm1\n" \ > + store " %%xmm1, 0(%[dst])\n" \ > + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1") > + > +#define COPY64(dstp, srcp, load, store) \ > + __asm__ volatile ( \ > + load " 0(%[src]), %%xmm1\n" \ > + load " 16(%[src]), %%xmm2\n" \ > + load " 32(%[src]), %%xmm3\n" \ > + load " 48(%[src]), %%xmm4\n" \ > + store " %%xmm1, 0(%[dst])\n" \ > + store " %%xmm2, 16(%[dst])\n" \ > + store " %%xmm3, 32(%[dst])\n" \ > + store " %%xmm4, 48(%[dst])\n" \ > + : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", > "xmm3", "xmm4") > +#endif > + > +void ff_image_copy_plane_from_uswc_x86(uint8_t *dst, size_t dst_linesize, > + const uint8_t *src, size_t src_linesize, > + unsigned bytewidth, unsigned height, > + int cpu_flags) > +{ > +#if !HAVE_SSSE3 Are any SSSE3 instructions used? 
> + return av_image_copy_plane(dst, dst_linesize, src, src_linesize, > bytewidth, height); > +#endif > + > + av_assert0(((intptr_t)dst & 0x0f) == 0 && (dst_linesize & 0x0f) == 0); > + > + __asm__ volatile ("mfence"); > + > + for (unsigned y = 0; y < height; y++) { > + const unsigned unaligned = (-(uintptr_t)src) & 0x0f; > + unsigned x = unaligned; > + > +#if HAVE_SSE42 > + if (cpu_flags & AV_CPU_FLAG_SSE4) { movntdqa is an SSE4.1 instruction, so this should work better: if (INLINE_SSE4(cpu_flags)) That checks both HAVE_SSE4_INLINE and cpu_flags for AV_CPU_FLAG_SSE4. (But then like others have said new inline asm code shouldn't be added in the first place) Timothy _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel