On 25 September 2018 at 16:27, Jun Zhao <mypopy...@gmail.com> wrote: > Add alpha blending API based on row, support global alpha blending/ > per-pixel blending, and add SSSE3/AVX2 optimizations of the functions. > > Signed-off-by: Jun Zhao <mypopy...@gmail.com> > --- > libavutil/Makefile | 2 + > libavutil/blend.c | 101 ++++++++++++ > libavutil/blend.h | 47 ++++++ > libavutil/x86/Makefile | 3 +- > libavutil/x86/blend.h | 32 ++++ > libavutil/x86/blend_init.c | 369 ++++++++++++++++++++++++++++++ > ++++++++++++++ > 6 files changed, 553 insertions(+), 1 deletions(-) > create mode 100644 libavutil/blend.c > create mode 100644 libavutil/blend.h > create mode 100644 libavutil/x86/blend.h > create mode 100644 libavutil/x86/blend_init.c > > diff --git a/libavutil/Makefile b/libavutil/Makefile > index 9ed24cf..f1c06e4 100644 > --- a/libavutil/Makefile > +++ b/libavutil/Makefile > @@ -10,6 +10,7 @@ HEADERS = adler32.h > \ > avstring.h \ > avutil.h \ > base64.h \ > + blend.h \ > blowfish.h \ > bprint.h \ > bswap.h \ > @@ -95,6 +96,7 @@ OBJS = adler32.o > \ > audio_fifo.o \ > avstring.o \ > base64.o \ > + blend.o \ > blowfish.o \ > bprint.o \ > buffer.o \ > diff --git a/libavutil/blend.c b/libavutil/blend.c > new file mode 100644 > index 0000000..e28efa0 > --- /dev/null > +++ b/libavutil/blend.c > @@ -0,0 +1,101 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. 
> + * > + * You should have received a copy of the GNU General Public License along > + * with FFmpeg; if not, write to the Free Software Foundation, Inc., > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/mem.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/blend.h" > + > +#include "libavutil/x86/blend.h" > + > +static void ff_global_blend_row_c(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, /* XXX: only use > alpha[0] */ > + uint8_t *dst, > + int width) > +{ > + int x; > + for (x = 0; x < width - 1; x += 2) { > + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) > >> 8; > + dst[1] = (src0[1] * alpha[0] + src1[1] * (255 - alpha[0]) + 255) > >> 8; > + src0 += 2; > + src1 += 2; > + dst += 2; > + } > + if (width & 1) { > + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) > >> 8; > + } > +} > + > +void av_global_blend_row(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width) > +{ > + blend_row blend_row_fn = NULL; > + > +#if ARCH_X86 > + blend_row_fn = ff_blend_row_init_x86(1); > +#endif > + > + if (!blend_row_fn) > + blend_row_fn = ff_global_blend_row_c; > + > + blend_row_fn(src0, src1, alpha, dst, width); > +} > + > +static void ff_per_pixel_blend_row_c(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width) > +{ > + int x; > + for (x = 0; x < width - 1; x += 2) { > + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) > >> 8; > + dst[1] = (src0[1] * alpha[0] + src1[1] * (255 - alpha[0]) + 255) > >> 8; > + src0 += 2; > + src1 += 2; > + dst += 2; > + alpha+= 2; > + } > + if (width & 1) { > + dst[0] = (src0[0] * alpha[0] + src1[0] * (255 - alpha[0]) + 255) > >> 8; > + } > +} > + > +void av_per_pixel_blend_row(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t 
*alpha, > + uint8_t *dst, > + int width) > +{ > + blend_row blend_row_fn = NULL; > + > +#if ARCH_X86 > + blend_row_fn = ff_blend_row_init_x86(0); > +#endif > + > + if (!blend_row_fn) > + blend_row_fn = ff_per_pixel_blend_row_c; > + > + blend_row_fn(src0, src1, alpha, dst, width); > +} > + > diff --git a/libavutil/blend.h b/libavutil/blend.h > new file mode 100644 > index 0000000..8a42109 > --- /dev/null > +++ b/libavutil/blend.h > @@ -0,0 +1,47 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with FFmpeg; if not, write to the Free Software Foundation, Inc., > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
> + */ > +#ifndef AVUTIL_BLEND_H > +#define AVUTIL_BLEND_H > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/mem.h" > +#include "libavutil/x86/asm.h" > + > +/** > + * Global alpha blending by row > + * > + * dst[i] = (src[i]*alpha[0]+(255-alpha[0])*src1[i]+255)>>8 > + */ > +void av_global_blend_row(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, /* XXX: only use alpha[0] > */ > + uint8_t *dst, > + int width); > + > +/** > + * Per-pixel alpha blending by row > + * > + * dst[i] = (src[i]*alpha[i]+(255-alpha[i])*src1[i]+255)>>8 > + */ > +void av_per_pixel_blend_row(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width); > +#endif > diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile > index 5f5242b..1e5e3e4 100644 > --- a/libavutil/x86/Makefile > +++ b/libavutil/x86/Makefile > @@ -1,4 +1,5 @@ > -OBJS += x86/cpu.o \ > +OBJS += x86/blend_init.o \ > + x86/cpu.o \ > x86/fixed_dsp_init.o \ > x86/float_dsp_init.o \ > x86/imgutils_init.o \ > diff --git a/libavutil/x86/blend.h b/libavutil/x86/blend.h > new file mode 100644 > index 0000000..9fa0f36 > --- /dev/null > +++ b/libavutil/x86/blend.h > @@ -0,0 +1,32 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#ifndef AVUTIL_X86_BLEND_H > +#define AVUTIL_X86_BLEND_H > + > +#include "libavutil/blend.h" > + > +typedef void (*blend_row)(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width); > + > +blend_row ff_blend_row_init_x86(int global); > + > +#endif /* AVUTIL_X86_BLEND_H */ > diff --git a/libavutil/x86/blend_init.c b/libavutil/x86/blend_init.c > new file mode 100644 > index 0000000..f555dfa > --- /dev/null > +++ b/libavutil/x86/blend_init.c > @@ -0,0 +1,369 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with FFmpeg; if not, write to the Free Software Foundation, Inc., > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. > + */ > + > +#include "libavutil/cpu.h" > +#include "libavutil/mem.h" > +#include "libavutil/x86/cpu.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/blend.h" > + > +#if HAVE_SSSE3_INLINE && HAVE_6REGS > +// per-pixel blend (8 pixels at a time.) 
> +// dst[i] = ((src0[i]*alpah[i])+(src1[i]*(255-alpha[i]))+255)/256 > +static void ff_per_pixel_blend_row_ssse3(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width) > +{ > + int aligned_w = width/8 * 8; > + int width_u = width - aligned_w; > + uint8_t *src0_u = (uint8_t *)src0 + aligned_w; > + uint8_t *src1_u = (uint8_t *)src1 + aligned_w; > + uint8_t *alpha_u = (uint8_t *)alpha + aligned_w; > + uint8_t *dst_u = dst + aligned_w; > + int i; > + > + if (aligned_w > 0) { > + __asm__ volatile( > + "pcmpeqb %%xmm3,%%xmm3 \n\t" > + "psllw $0x8,%%xmm3 \n\t" > + "mov $0x80808080,%%eax \n\t" > + "movd %%eax,%%xmm3 \n\t" > + "pshufd $0x0,%%xmm4,%%xmm4 \n\t" > + "mov $0x807f807f,%%eax \n\t" > + "movd %%eax,%%xmm5 \n\t" > + "pshufd $0x0,%%xmm5,%%xmm5 \n\t" > + "sub %2,%0 \n\t" > + "sub %2,%1 \n\t" > + "sub %2,%3 \n\t" > + > + // 8 pixel per loop. > + "1: \n\t" > + "movq (%2),%%xmm0 \n\t" > + "punpcklbw %%xmm0,%%xmm0 \n\t" > + "pxor %%xmm3,%%xmm0 \n\t" > + "movq (%0,%2,1),%%xmm1 \n\t" > + "movq (%1,%2,1),%%xmm2 \n\t" > + "punpcklbw %%xmm2,%%xmm1 \n\t" > + "psubb %%xmm4,%%xmm1 \n\t" > + "pmaddubsw %%xmm1,%%xmm0 \n\t" > + "paddw %%xmm5,%%xmm0 \n\t" > + "psrlw $0x8,%%xmm0 \n\t" > + "packuswb %%xmm0,%%xmm0 \n\t" > + "movq %%xmm0,(%3,%2,1) \n\t" > + "lea 0x8(%2),%2 \n\t" > + "sub $0x8,%4 \n\t" > + "jg 1b \n\t" > + : "+r"(src0), // %0 > + "+r"(src1), // %1 > + "+r"(alpha), // %2 > + "+r"(dst), // %3 > + "+rm"(aligned_w) // %4 > + ::"memory", > + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); > + } > + > + for (i = 0; i < width_u - 1; i += 2) { > + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - > alpha_u[0]) + 255) >> 8; > + dst_u[1] = (src0_u[1] * alpha_u[0] + src1_u[1] * (255 - > alpha_u[0]) + 255) >> 8; > + src0_u += 2; > + src1_u += 2; > + dst_u += 2; > + alpha_u+= 2; > + } > + if (width_u & 1) { > + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - > alpha_u[0]) + 255) >> 8; > + } > +} > + > +// 
global blend (8 pixels at a time). > +// dst[i] = ((src0[i]*alpah[0])+(src1[i]*(255-alpha[0]))+255)/256 > +static void ff_global_blend_row_ssse3(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width) > +{ > + int aligned_w = width/8 * 8; > + int width_u = width - aligned_w; > + uint8_t *src0_u = (uint8_t *)src0 + aligned_w; > + uint8_t *src1_u = (uint8_t *)src1 + aligned_w; > + uint8_t *dst_u = dst + aligned_w; > + int i; > + > + if (aligned_w > 0) { > + __asm__ volatile( > + "pcmpeqb %%xmm3,%%xmm3 \n\t" > + "psllw $0x8,%%xmm3 \n\t" > + "mov $0x80808080,%%eax \n\t" > + "movd %%eax,%%xmm4 \n\t" > + "pshufd $0x0,%%xmm4,%%xmm4 \n\t" > + "mov $0x807f807f,%%eax \n\t" > + "movd %%eax,%%xmm5 \n\t" > + "pshufd $0x0,%%xmm5,%%xmm5 \n\t" > + // a => xmm6 [a a a a a a a a a a a a a a a a ] > + "movb (%2),%%al \n\t" > + "movd %%eax,%%xmm6 \n\t" // xmm6 = x > x x x x x x x x x x x x x x a > + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x > x x x x x x x x x x x x x a a > + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x > x x x x x x x x x x x a a a a > + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = x > x x x x x x x a a a a a a a a > + "punpcklbw %%xmm6,%%xmm6 \n\t" // xmm6 = a > a a a a a a a a a a a a a a a > + > + // 8 pixel per loop. 
> + "1: \n\t" > + "movdqu %%xmm6,%%xmm0 \n\t" // xmm0 = > xmm6 > + "pxor %%xmm3,%%xmm0 \n\t" > + > + "movq (%0),%%xmm1 \n\t" > + "movq (%1),%%xmm2 \n\t" > + "punpcklbw %%xmm2,%%xmm1 \n\t" > + "psubb %%xmm4,%%xmm1 \n\t" > + > + "pmaddubsw %%xmm1,%%xmm0 \n\t" > + "paddw %%xmm5,%%xmm0 \n\t" > + "psrlw $0x8,%%xmm0 \n\t" > + "packuswb %%xmm0,%%xmm0 \n\t" > + "movq %%xmm0,(%3) \n\t" > + > + "lea 0x8(%0),%0 \n\t" // src0+8 > + "lea 0x8(%1),%1 \n\t" // src1+8 > + "lea 0x8(%3),%3 \n\t" // dst+8 > + "sub $0x8,%4 \n\t" > + "jg 1b \n\t" > + : "+r"(src0), // %0 > + "+r"(src1), // %1 > + "+r"(alpha), // %2 > + "+r"(dst), // %3 > + "+rm"(aligned_w) // %4 > + ::"memory", > + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", > "xmm6"); > + } > + > + for (i = 0; i < width_u - 1; i += 2) { > + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + > 255) >> 8; > + dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) + > 255) >> 8; > + src0_u += 2; > + src1_u += 2; > + dst_u += 2; > + } > + if (width_u & 1) { > + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + > 255) >> 8; > + } > +} > +#endif > + > +#if HAVE_AVX2_INLINE && HAVE_6REGS > +// per-pixe blend (32 pixels at a time). 
> +// dst[i] = ((src0[i]*alpah[i])+(src1[i]*(255-alpha[i]))+255)/256 > +static void ff_per_pixel_blend_row_avx2(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width) > +{ > + int aligned_w = width/32 * 32; > + int width_u = width - aligned_w; > + uint8_t *src0_u = (uint8_t *)src0 + aligned_w; > + uint8_t *src1_u = (uint8_t *)src1 + aligned_w; > + uint8_t *alpha_u = (uint8_t *)alpha + aligned_w; > + uint8_t *dst_u = dst + aligned_w; > + int i; > + > + if (aligned_w > 0) { > + __asm__ volatile( > + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n\t" > + "vpsllw $0x8,%%ymm5,%%ymm5 \n\t" > + "mov $0x80808080,%%eax \n\t" > + "vmovd %%eax,%%xmm6 \n\t" > + "vbroadcastss %%xmm6,%%ymm6 \n\t" > + "mov $0x807f807f,%%eax \n\t" > + "vmovd %%eax,%%xmm7 \n\t" > + "vbroadcastss %%xmm7,%%ymm7 \n\t" > + "sub %2,%0 \n\t" > + "sub %2,%1 \n\t" > + "sub %2,%3 \n\t" > + > + // 32 pixel per loop. > + "1: \n\t" > + "vmovdqu (%2),%%ymm0 \n\t" > + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n\t" > + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n\t" > + "vpxor %%ymm5,%%ymm3,%%ymm3 \n\t" > + "vpxor %%ymm5,%%ymm0,%%ymm0 \n\t" > + "vmovdqu (%0,%2,1),%%ymm1 \n\t" > + "vmovdqu (%1,%2,1),%%ymm2 \n\t" > + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n\t" > + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n\t" > + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n\t" > + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n\t" > + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n\t" > + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n\t" > + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n\t" > + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n\t" > + "vpsrlw $0x8,%%ymm3,%%ymm3 \n\t" > + "vpsrlw $0x8,%%ymm0,%%ymm0 \n\t" > + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n\t" > + "vmovdqu %%ymm0,(%3,%2,1) \n\t" > + "lea 0x20(%2),%2 \n\t" > + "sub $0x20,%4 \n\t" > + "jg 1b \n\t" > + "vzeroupper \n\t" > + : "+r"(src0), // %0 > + "+r"(src1), // %1 > + "+r"(alpha), // %2 > + "+r"(dst), // %3 > + "+rm"(aligned_w) // %4 > + ::"memory", > + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", > "xmm6", > + "xmm7"); > + } > 
+ > + for (i = 0; i < width_u - 1; i += 2) { > + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - > alpha_u[0]) + 255) >> 8; > + dst_u[1] = (src0_u[1] * alpha_u[0] + src1_u[1] * (255 - > alpha_u[0]) + 255) >> 8; > + src0_u += 2; > + src1_u += 2; > + dst_u += 2; > + alpha_u+= 2; > + } > + if (width_u & 1) { > + dst_u[0] = (src0_u[0] * alpha_u[0] + src1_u[0] * (255 - > alpha_u[0]) + 255) >> 8; > + } > +} > + > +// global blend (32 pixels at a time) > +// dst[i] = ((src0[i]*alpah[0])+(src1[i]*(255-alpha[0]))+255)/256 > +static void ff_global_blend_row_avx2(const uint8_t *src0, > + const uint8_t *src1, > + const uint8_t *alpha, > + uint8_t *dst, > + int width) > +{ > + int aligned_w = width/32 * 32; > + int width_u = width - aligned_w; > + uint8_t *src0_u = (uint8_t *)src0 + aligned_w; > + uint8_t *src1_u = (uint8_t *)src1 + aligned_w; > + uint8_t *dst_u = dst + aligned_w; > + int i; > + > + if (aligned_w > 0) { > + __asm__ volatile( > + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n\t" > + "vpsllw $0x8,%%ymm5,%%ymm5 \n\t" > + "mov $0x80808080,%%eax \n\t" > + "vmovd %%eax,%%xmm6 \n\t" > + "vbroadcastss %%xmm6,%%ymm6 \n\t" > + "mov $0x807f807f,%%eax \n\t" > + "vmovd %%eax,%%xmm7 \n\t" > + "vbroadcastss %%xmm7,%%ymm7 \n\t" > + // a => ymm8 [a a a a a a a a a a a a a a a a > + // a a a a a a a a a a a a a a a a > + // a a a a a a a a a a a a a a a a > + // a a a a a a a a a a a a a a a a] > + "movb (%2),%%al \n\t" > + "movd %%eax,%%xmm8 \n\t" // xmm8 = x > x x x x x x x x x x x x x x a > + "punpcklbw %%xmm8,%%xmm8 \n\t" // xmm8 = x > x x x x x x x x x x x x x a a > + "punpcklbw %%xmm8,%%xmm8 \n\t" // xmm8 = x > x x x x x x x x x x x a a a a > + "vbroadcastss %%xmm8,%%ymm8 \n\t" > + > + // 32 pixel per loop. 
> + "1: \n\t" > + "vmovdqu %%ymm8,%%ymm0 \n\t" > + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n\t" > + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n\t" > + "vpxor %%ymm5,%%ymm3,%%ymm3 \n\t" > + "vpxor %%ymm5,%%ymm0,%%ymm0 \n\t" > + > + "vmovdqu (%0),%%ymm1 \n\t" > + "vmovdqu (%1),%%ymm2 \n\t" > + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n\t" > + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n\t" > + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n\t" > + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n\t" > + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n\t" > + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n\t" > + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n\t" > + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n\t" > + "vpsrlw $0x8,%%ymm3,%%ymm3 \n\t" > + "vpsrlw $0x8,%%ymm0,%%ymm0 \n\t" > + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n\t" > + > + "vmovdqu %%ymm0,(%3) \n\t" > + "lea 0x20(%0),%0 \n\t" > + "lea 0x20(%1),%1 \n\t" > + "lea 0x20(%3),%3 \n\t" > + "sub $0x20,%4 \n\t" > + "jg 1b \n\t" > + "vzeroupper \n\t" > + : "+r"(src0), // %0 > + "+r"(src1), // %1 > + "+r"(alpha), // %2 > + "+r"(dst), // %3 > + "+rm"(aligned_w) // %4 > + ::"memory", > + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", > "xmm6", > + "xmm7", "xmm8"); > + } > + > + for (i = 0; i < width_u - 1; i += 2) { > + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + > 255) >> 8; > + dst_u[1] = (src0_u[1] * alpha[0] + src1_u[1] * (255 - alpha[0]) + > 255) >> 8; > + src0_u += 2; > + src1_u += 2; > + dst_u += 2; > + } > + if (width_u & 1) { > + dst_u[0] = (src0_u[0] * alpha[0] + src1_u[0] * (255 - alpha[0]) + > 255) >> 8; > + } > +} > +#endif > + > +av_cold blend_row ff_blend_row_init_x86(int global) > +{ > + blend_row blend_row_fn = NULL; > + int cpu_flags = av_get_cpu_flags(); > + > + if (global) { > +#if HAVE_SSSE3_INLINE && HAVE_6REGS > + if (EXTERNAL_SSSE3(cpu_flags)) { > + blend_row_fn = ff_global_blend_row_ssse3; > + } > +#endif > + > +#if HAVE_AVX2_INLINE && HAVE_6REGS > + if (EXTERNAL_AVX2_FAST(cpu_flags)) { > + blend_row_fn = ff_global_blend_row_avx2; > + } > +#endif > + } else { > +#if 
HAVE_SSSE3_INLINE && HAVE_6REGS > + if (EXTERNAL_SSSE3(cpu_flags)) { > + blend_row_fn = ff_per_pixel_blend_row_ssse3; > + } > +#endif > + > +#if HAVE_AVX2_INLINE && HAVE_6REGS > + if (EXTERNAL_AVX2_FAST(cpu_flags)) { > + blend_row_fn = ff_per_pixel_blend_row_avx2; > + } > +#endif > + } > + > + return blend_row_fn; > +} > -- > 1.7.1 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
We don't use inline asm for x86 SIMD, and we don't use global contexts. Look at how float_dsp is done for an example of the expected structure. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel