On Wed, Jun 15, 2016 at 6:25 AM, Dan Parrot <dan.par...@mail.com> wrote: > This is the first commit addressing Trac ticket #5570. Functions defined in > libswscale/input.c have corresponding definitions in > libswscale/ppc/input_vsx.h > The corresponding function names in the latter contain the suffix "_vsx". > --- > libswscale/input.c | 44 +-- > libswscale/ppc/input_vsx.h | 831 > +++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 853 insertions(+), 22 deletions(-) > create mode 100644 libswscale/ppc/input_vsx.h > > diff --git a/libswscale/input.c b/libswscale/input.c > index 14ab5ab..de4347e 100644 > --- a/libswscale/input.c > +++ b/libswscale/input.c > @@ -40,6 +40,13 @@ > #define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || > origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b) > #define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || > origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r) > > +#ifdef HAVE_VSX > +#include "ppc/input_vsx.h" > +#define RENAME_SIMD(fname) fname ## _vsx > +#elif > +#define RENAME_SIMD(fname) fname > +#endif > + > static av_always_inline void > rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width, > enum AVPixelFormat origin, int32_t *rgb2yuv) > @@ -99,7 +106,7 @@ static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, > const uint8_t *_src, > { \ > const uint16_t *src = (const uint16_t *) _src; \ > uint16_t *dst = (uint16_t *) _dst; \ > - rgb64ToY_c_template(dst, src, width, origin, rgb2yuv); \ > + RENAME_SIMD(rgb64ToY_c_template)(dst, src, width, origin, rgb2yuv); \ > } \
This is not how we integrate SIMD optimizations. These are the C functions; they are not meant to perform SIMD. What you should do is provide SIMD functions and then provide a SIMD-specific init function that overwrites the function pointers with your SIMD functions, i.e. just how it is done on x86. But do not touch the C functions by overriding them right in the code with SIMD variants, making the C variants inaccessible. > \ > static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, > \ > @@ -109,7 +116,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t > *_dstU, uint8_t *_dstV, \ > const uint16_t *src1 = (const uint16_t *) _src1, \ > *src2 = (const uint16_t *) _src2; \ > uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \ > - rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \ > + RENAME_SIMD(rgb64ToUV_c_template)(dstU, dstV, src1, src2, width, origin, > rgb2yuv); \ > } \ > \ > static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t > *_dstV, \ > @@ -119,7 +126,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t > *_dstU, uint8_t *_dstV > const uint16_t *src1 = (const uint16_t *) _src1, \ > *src2 = (const uint16_t *) _src2; \ > uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \ > - rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, > rgb2yuv); \ > + RENAME_SIMD(rgb64ToUV_half_c_template)(dstU, dstV, src1, src2, width, > origin, rgb2yuv); \ > } > > rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE) > @@ -203,7 +210,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t > *_dst, \ > { \ > const uint16_t *src = (const uint16_t *)_src; \ > uint16_t *dst = (uint16_t *)_dst; \ > - rgb48ToY_c_template(dst, src, width, origin, rgb2yuv); \ > + RENAME_SIMD(rgb48ToY_c_template)(dst, src, width, origin, rgb2yuv); > \ > } \ > \ > static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \ > @@ -218,7 +225,7 @@ static void pattern ## 48 ## BE_LE ## 
ToUV_c(uint8_t > *_dstU, \ > *src2 = (const uint16_t *)_src2; \ > uint16_t *dstU = (uint16_t *)_dstU, \ > *dstV = (uint16_t *)_dstV; \ > - rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); > \ > + RENAME_SIMD(rgb48ToUV_c_template)(dstU, dstV, src1, src2, width, origin, > rgb2yuv); \ > } \ > \ > static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \ > @@ -233,7 +240,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t > *_dstU, \ > *src2 = (const uint16_t *)_src2; \ > uint16_t *dstU = (uint16_t *)_dstU, \ > *dstV = (uint16_t *)_dstV; \ > - rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, > rgb2yuv); \ > + RENAME_SIMD(rgb48ToUV_half_c_template)(dstU, dstV, src1, src2, width, > origin, rgb2yuv); \ > } > > rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE) > @@ -273,7 +280,6 @@ static av_always_inline void > rgb16_32ToY_c_template(int16_t *dst, > dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6); > } > } > - > static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU, > int16_t *dstV, > const uint8_t *src, > @@ -351,17 +357,17 @@ static av_always_inline void > rgb16_32ToUV_half_c_template(int16_t *dstU, > static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t > *unused1, const uint8_t *unused2, \ > int width, uint32_t *tab) \ > { \ > - rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, > shp, \ > - maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \ > + RENAME_SIMD(rgb16_32ToY_c_template)((int16_t*)dst, src, width, fmt, shr, > shg, shb, shp, \ > + maskr, maskg, maskb, rsh, gsh, bsh, > S, tab); \ > } \ > \ > static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \ > const uint8_t *unused0, const uint8_t *src, const > uint8_t *dummy, \ > int width, uint32_t *tab) \ > { \ > - rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, > \ > - shr, shg, shb, shp, \ > - maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\ > + 
RENAME_SIMD(rgb16_32ToUV_c_template)((int16_t*)dstU, (int16_t*)dstV, > src, width, fmt, \ > + shr, shg, shb, shp, > \ > + maskr, maskg, maskb, rsh, gsh, bsh, > S, tab);\ > } \ > \ > static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \ > @@ -369,10 +375,10 @@ static void name ## ToUV_half_c(uint8_t *dstU, uint8_t > *dstV, \ > const uint8_t *dummy, \ > int width, uint32_t *tab) \ > { \ > - rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, > fmt, \ > - shr, shg, shb, shp, \ > - maskr, maskg, maskb, \ > - rsh, gsh, bsh, S, tab); \ > + RENAME_SIMD(rgb16_32ToUV_half_c_template)((int16_t*)dstU, > (int16_t*)dstV, src, width, fmt, \ > + shr, shg, shb, shp, > \ > + maskr, maskg, maskb, > \ > + rsh, gsh, bsh, S, tab); > \ > } > > rgb16_32_wrapper(AV_PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, > 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8) > @@ -978,7 +984,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) > case AV_PIX_FMT_GBRP9LE: > c->readChrPlanar = planar_rgb9le_to_uv; > break; > - case AV_PIX_FMT_GBRAP10LE: > case AV_PIX_FMT_GBRP10LE: > c->readChrPlanar = planar_rgb10le_to_uv; > break; > @@ -996,7 +1001,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) > case AV_PIX_FMT_GBRP9BE: > c->readChrPlanar = planar_rgb9be_to_uv; > break; > - case AV_PIX_FMT_GBRAP10BE: > case AV_PIX_FMT_GBRP10BE: > c->readChrPlanar = planar_rgb10be_to_uv; > break; > @@ -1260,8 +1264,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) > case AV_PIX_FMT_GBRP9LE: > c->readLumPlanar = planar_rgb9le_to_y; > break; > - case AV_PIX_FMT_GBRAP10LE: > - c->readAlpPlanar = planar_rgb10le_to_a; > case AV_PIX_FMT_GBRP10LE: > c->readLumPlanar = planar_rgb10le_to_y; > break; > @@ -1281,8 +1283,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) > case AV_PIX_FMT_GBRP9BE: > c->readLumPlanar = planar_rgb9be_to_y; > break; > - case AV_PIX_FMT_GBRAP10BE: > - c->readAlpPlanar = planar_rgb10be_to_a; > case AV_PIX_FMT_GBRP10BE: > c->readLumPlanar = 
planar_rgb10be_to_y; > break; > diff --git a/libswscale/ppc/input_vsx.h b/libswscale/ppc/input_vsx.h > new file mode 100644 > index 0000000..09fe8c1 > --- /dev/null > +++ b/libswscale/ppc/input_vsx.h > @@ -0,0 +1,831 @@ > +/* > + * Copyright (C) 2016 Dan Parrot <dan.par...@mail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +// This is a SIMD version for IBM POWER8 of function rgb64ToY_c_template > +// in file libswscale/input.c > +static av_always_inline void > +rgb64ToY_c_template_vsx(uint16_t *dst, const uint16_t *src, int width, > + enum AVPixelFormat origin, int32_t *rgb2yuv) > +{ > + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; > + int i, j; > + int num_vec, frag; > + > + num_vec = width / 8; > + frag = width % 8; > + > + vector int v_ry = vec_splats((int)ry); > + vector int v_gy = vec_splats((int)gy); > + vector int v_by = vec_splats((int)by); > + > + int s_opr2; > + s_opr2 = (int)(0x2001 << (RGB2YUV_SHIFT-1)); > + > + vector int v_opr1 = vec_splats((int)RGB2YUV_SHIFT); > + vector int v_opr2 = vec_splats((int)s_opr2); > + > + vector int v_r, v_g, v_b, v_tmp; > + vector short v_tmpi, v_dst; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0 ; j--) { > + 
int r_b = input_pixel(&src[(i*8+j)*4+0]); > + int g = input_pixel(&src[(i*8+j)*4+1]); > + int b_r = input_pixel(&src[(i*8+j)*4+2]); > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if (!(j % 4)) { > + v_tmp = v_ry * v_r; > + v_tmp = v_tmp + v_gy * v_g; > + v_tmp = v_tmp + v_by * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dst[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dst[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dst[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dst[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dst, 0, (short *)&dst[i*8]); > + } > + > + // computation for any less than vector-length items at tail end > + if( frag ) { > + for (i = 0; i < frag; i++) { > + unsigned int r_b = input_pixel(&src[num_vec*8+i*4+0]); > + unsigned int g = input_pixel(&src[num_vec*8+i*4+1]); > + unsigned int b_r = input_pixel(&src[num_vec*8+i*4+2]); > + > + dst[num_vec*8+i] = (ry*r + gy*g + by*b + > (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + } > + } > + > +} > + > +// This is a SIMD version for IBM POWER8 of function rgb64ToUV_c_template > +// in file libswscale/input.c > +static av_always_inline void > +rgb64ToUV_c_template_vsx(uint16_t *dstU, uint16_t *dstV, > + const uint16_t *src1, const uint16_t *src2, > + int width, enum AVPixelFormat origin, int32_t *rgb2yuv) > +{ > + > + int i, j; > + int num_vec, frag; > + int s_opr2; > + > + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; > + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; > + av_assert1(src1==src2); > + > + s_opr2 = (int)(0x10001 << (RGB2YUV_SHIFT-1)); > + > + vector int v_ru = vec_splats((int)ru); > + vector int v_gu = vec_splats((int)gu); > + vector int v_bu = vec_splats((int)bu); > + > + vector int v_rv = vec_splats((int)rv); > + vector int v_gv = vec_splats((int)gv); > + vector int v_bv = vec_splats((int)bv); > + > + vector int v_opr1 = 
vec_splats((int)RGB2YUV_SHIFT); > + vector int v_opr2 = vec_splats((int)s_opr2); > + > + vector int v_r, v_g, v_b, v_tmp; > + vector short v_tmpi, v_dstu, v_dstv; > + > + num_vec = width / 8; > + frag = width % 8; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0 ; j--) { > + int r_b = input_pixel(&src1[(i*8+j)*4+0]); > + int g = input_pixel(&src1[(i*8+j)*4+1]); > + int b_r = input_pixel(&src1[(i*8+j)*4+2]); > + > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if (!(j % 4)) { > + v_tmp = v_ru * v_r; > + v_tmp = v_tmp + v_gu * v_g; > + v_tmp = v_tmp + v_bu * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstu[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstu[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstu[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstu[(j / 4) * 4 + 0] = v_tmpi[0]; > + > + v_tmp = v_rv * v_r; > + v_tmp = v_tmp + v_gv * v_g; > + v_tmp = v_tmp + v_bv * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstv[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstv[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstv[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstv[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dstu, 0, (short *)&dstU[i*8]); > + vec_vsx_st(v_dstv, 0, (short *)&dstV[i*8]); > + } > + > + // computation for any less than vector-length items at tail end > + if( frag ) { > + for (i = 0; i < frag; i++) { > + int r_b = input_pixel(&src1[num_vec*8+i*4+0]); > + int g = input_pixel(&src1[num_vec*8+i*4+1]); > + int b_r = input_pixel(&src1[num_vec*8+i*4+2]); > + > + dstU[num_vec*8+i] = (ru*r + gu*g + bu*b + > (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + dstV[num_vec*8+i] = (rv*r + gv*g + bv*b + > (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + } > + } > + > +} > + > +// This is a SIMD version for IBM POWER8 of function > rgb64ToUV_half_c_template > +// in file libswscale/input.c > 
+static av_always_inline void > +rgb64ToUV_half_c_template_vsx(uint16_t *dstU, uint16_t *dstV, > + const uint16_t *src1, const uint16_t *src2, > + int width, enum AVPixelFormat origin, int32_t > *rgb2yuv) > +{ > + > + int i, j; > + int num_vec, frag; > + int s_opr2; > + > + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; > + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; > + av_assert1(src1==src2); > + > + vector int v_ru = vec_splats((int)ru); > + vector int v_gu = vec_splats((int)gu); > + vector int v_bu = vec_splats((int)bu); > + > + vector int v_rv = vec_splats((int)rv); > + vector int v_gv = vec_splats((int)gv); > + vector int v_bv = vec_splats((int)bv); > + > + s_opr2 = (int)(0x10001 << (RGB2YUV_SHIFT-1)); > + > + vector int v_opr1 = vec_splats((int)RGB2YUV_SHIFT); > + vector int v_opr2 = vec_splats((int)s_opr2); > + > + vector int v_r, v_g, v_b, v_tmp; > + vector short v_tmpi, v_dstu, v_dstv; > + > + num_vec = width / 8; > + frag = width % 8; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0 ; j--) { > + int r_b = (input_pixel(&src1[8 * i * 8 + j + 0]) + > input_pixel(&src1[8 * i * 8 + j + 4]) + 1) >> 1; > + int g = (input_pixel(&src1[8 * i * 8 + j + 1]) + > input_pixel(&src1[8 * i * 8 + j + 5]) + 1) >> 1; > + int b_r = (input_pixel(&src1[8 * i * 8 + j + 2]) + > input_pixel(&src1[8 * i * 8 + j + 6]) + 1) >> 1; > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if (!(j % 4)) { > + v_tmp = v_ru * v_r; > + v_tmp = v_tmp + v_gu * v_g; > + v_tmp = v_tmp + v_bu * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstu[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstu[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstu[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstu[(j / 4) * 4 + 0] = v_tmpi[0]; > + > + v_tmp = v_rv * v_r; > + v_tmp = v_tmp + v_gv * v_g; > + v_tmp = v_tmp + v_bv * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = 
vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstv[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstv[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstv[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstv[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dstu, 0, (short *)&dstU[i*8]); > + vec_vsx_st(v_dstv, 0, (short *)&dstV[i*8]); > + } > + > + // computation for any less than vector-length items at tail end > + if( frag ) { > + for (i = 0; i < frag; i++) { > + int r_b = (input_pixel(&src1[num_vec * 8 + 8 * i + 0]) + > + input_pixel(&src1[num_vec * 8 + 8 * i + 4]) + 1) >> 1; > + int g = (input_pixel(&src1[num_vec * 8 + 8 * i + 1]) + > + input_pixel(&src1[num_vec * 8 + 8 * i + 5]) + 1) >> 1; > + int b_r = (input_pixel(&src1[num_vec * 8 + 8 * i + 2]) + > + input_pixel(&src1[num_vec * 8 + 8 * i + 6]) + 1) >> 1; > + > + dstU[num_vec*8+i] = (ru*r + gu*g + bu*b + > (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + dstV[num_vec*8+i] = (rv*r + gv*g + bv*b + > (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + } > + } > + > +} > + > +// This is a SIMD version for IBM POWER8 of function rgb48ToY_c_template > +// in file libswscale/input.c > +static av_always_inline void rgb48ToY_c_template_vsx(uint16_t *dst, > + const uint16_t *src, int > width, > + enum AVPixelFormat origin, > + int32_t *rgb2yuv) > +{ > + > + int i, j; > + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX]; > + > + int num_vec, frag; > + > + num_vec = width / 8; > + frag = width % 8; > + > + vector int v_ry = vec_splats((int)ry); > + vector int v_gy = vec_splats((int)gy); > + vector int v_by = vec_splats((int)by); > + > + int s_opr2; > + s_opr2 = (int)(0x2001 << (RGB2YUV_SHIFT-1)); > + > + vector int v_opr1 = vec_splats((int)RGB2YUV_SHIFT); > + vector int v_opr2 = vec_splats((int)s_opr2); > + > + vector int v_r, v_g, v_b, v_tmp; > + vector short v_tmpi, v_dst; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0 ; j--) { > + int r_b = 
input_pixel(&src[(i * 8 + j) * 3 + 0]); > + int g = input_pixel(&src[(i * 8 + j) * 3 + 1]); > + int b_r = input_pixel(&src[(i * 8 + j) * 3 + 2]); > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if (!(j % 4)) { > + v_tmp = v_ry * v_r; > + v_tmp = v_tmp + v_gy * v_g; > + v_tmp = v_tmp + v_by * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dst[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dst[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dst[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dst[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dst, 0, (short *)&dst[i*8]); > + } > + > + // computation for any less than vector-length items at tail end > + if( frag ) { > + for (i = 0; i < frag; i++) { > + int r_b = input_pixel(&src[(num_vec * 8 + i) * 3 + 0]); > + int g = input_pixel(&src[(num_vec * 8 + i) * 3 + 1]); > + int b_r = input_pixel(&src[(num_vec * 8 + i) * 3 + 2]); > + > + dst[num_vec*8+i] = (ry*r + gy*g + by*b + > (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + } > + } > + > +} > + > +// This is a SIMD version for IBM POWER8 of function rgb48ToUV_c_template > +// in file libswscale/input.c > +static av_always_inline void rgb48ToUV_c_template_vsx(uint16_t *dstU, > + uint16_t *dstV, > + const uint16_t *src1, > + const uint16_t *src2, > + int width, > + enum AVPixelFormat origin, > + int32_t *rgb2yuv) > +{ > + > + int i, j; > + int num_vec, frag; > + int s_opr2; > + > + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; > + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; > + av_assert1(src1==src2); > + > + num_vec = width / 8; > + frag = width % 8; > + > + vector int v_ru = vec_splats((int)ru); > + vector int v_gu = vec_splats((int)gu); > + vector int v_bu = vec_splats((int)bu); > + > + vector int v_rv = vec_splats((int)rv); > + vector int v_gv = vec_splats((int)gv); > + vector int v_bv = vec_splats((int)bv); > + > + s_opr2 = 
(int)(0x10001 << (RGB2YUV_SHIFT-1)); > + > + vector int v_opr1 = vec_splats((int)RGB2YUV_SHIFT); > + vector int v_opr2 = vec_splats((int)s_opr2); > + > + vector int v_r, v_g, v_b, v_tmp; > + vector short v_tmpi, v_dstu, v_dstv; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0 ; j--) { > + int r_b = input_pixel(&src1[(i * 8 + j) * 3 + 0]); > + int g = input_pixel(&src1[(i * 8 + j) * 3 + 1]); > + int b_r = input_pixel(&src1[(i * 8 + j) * 3 + 2]); > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if(!(j % 4)) { > + v_tmp = v_ru * v_r; > + v_tmp = v_tmp + v_gu * v_g; > + v_tmp = v_tmp + v_bu * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstu[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstu[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstu[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstu[(j / 4) * 4 + 0] = v_tmpi[0]; > + > + v_tmp = v_rv * v_r; > + v_tmp = v_tmp + v_gv * v_g; > + v_tmp = v_tmp + v_bv * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstv[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstv[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstv[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstv[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dstu, 0, (short *)&dstU[i*8]); > + vec_vsx_st(v_dstv, 0, (short *)&dstV[i*8]); > + } > + > + // computation for any less than vector-length items at tail end > + if( frag ) { > + for (i = 0; i < frag; i++) { > + int r_b = input_pixel(&src1[num_vec * 8 + i * 3 + 0]); > + int g = input_pixel(&src1[num_vec * 8 + i * 3 + 1]); > + int b_r = input_pixel(&src1[num_vec * 8 + i * 3 + 2]); > + > + dstU[num_vec*8+i] = (ru*r + gu*g + bu*b + (0x10001 << > (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + dstV[num_vec*8+i] = (rv*r + gv*g + bv*b + (0x10001 << > (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + } > + } > + > +} > + > +// This is a SIMD version for IBM POWER8 of function > 
rgb48ToUV_half_c_template > +// in file libswscale/input.c > +static av_always_inline void rgb48ToUV_half_c_template_vsx(uint16_t *dstU, > + uint16_t *dstV, > + const uint16_t *src1, > + const uint16_t *src2, > + int width, > + enum AVPixelFormat origin, > + int32_t *rgb2yuv) > +{ > + > + int i, j; > + int num_vec, frag; > + > + int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX]; > + int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX]; > + av_assert1(src1==src2); > + > + num_vec = width / 8; > + frag = width % 8; > + > + vector int v_ru = vec_splats((int)ru); > + vector int v_gu = vec_splats((int)gu); > + vector int v_bu = vec_splats((int)bu); > + > + vector int v_rv = vec_splats((int)rv); > + vector int v_gv = vec_splats((int)gv); > + vector int v_bv = vec_splats((int)bv); > + > + int s_opr2; > + s_opr2 = (int)(0x10001 << (RGB2YUV_SHIFT-1)); > + > + vector int v_opr1 = vec_splats((int)RGB2YUV_SHIFT); > + vector int v_opr2 = vec_splats((int)s_opr2); > + > + vector int v_r, v_g, v_b, v_tmp; > + vector short v_tmpi, v_dstu, v_dstv; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0 ; j--) { > + int r_b = (input_pixel(&src1[6 * (i * 8 + j) + 0]) + > + input_pixel(&src1[6 * (i * 8 + j) + 3]) + 1) >> 1; > + int g = (input_pixel(&src1[6 * (i * 8 + j) + 1]) + > + input_pixel(&src1[6 * (i * 8 + j) + 4]) + 1) >> 1; > + int b_r = (input_pixel(&src1[6 * (i * 8 + j) + 2]) + > + input_pixel(&src1[6 * (i * 8 + j) + 5]) + 1) >> 1; > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if(!(j % 4)) { > + v_tmp = v_ru * v_r; > + v_tmp = v_tmp + v_gu * v_g; > + v_tmp = v_tmp + v_bu * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstu[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstu[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstu[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstu[(j / 4) * 4 + 0] = v_tmpi[0]; > + > + v_tmp = v_rv * v_r; > + v_tmp = 
v_tmp + v_gv * v_g; > + v_tmp = v_tmp + v_bv * v_b; > + v_tmp = v_tmp + v_opr2; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr1); > + > + v_tmpi = (vector short)v_tmp; > + v_dstv[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstv[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstv[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstv[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dstu, 0, (short *)&dstU[i*8]); > + vec_vsx_st(v_dstv, 0, (short *)&dstV[i*8]); > + } > + > + // computation for any less than vector-length items at tail end > + if( frag ) { > + for (i = 0; i < frag; i++) { > + int r_b = (input_pixel(&src1[6 * (num_vec * 8 + i) + 0]) + > + input_pixel(&src1[6 * (num_vec * 8 + i) + 3]) + 1) >> > 1; > + int g = (input_pixel(&src1[6 * (num_vec * 8 + i) + 1]) + > + input_pixel(&src1[6 * (num_vec * 8 + i) + 4]) + 1) >> > 1; > + int b_r = (input_pixel(&src1[6 * (num_vec * 8 + i) + 2]) + > + input_pixel(&src1[6 * (num_vec * 8 + i) + 5]) + 1) >> > 1; > + > + dstU[num_vec*8+i] = (ru*r + gu*g + bu*b + (0x10001 << > (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + dstV[num_vec*8+i] = (rv*r + gv*g + bv*b + (0x10001 << > (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT; > + } > + } > +} > + > +#pragma push_macro("r") > +#pragma push_macro("b") > +#pragma push_macro("input_pixel") > + > +#undef r > +#undef b > +#undef input_pixel > + > +#define input_pixel(i) ((origin == AV_PIX_FMT_RGBA || \ > + origin == AV_PIX_FMT_BGRA || \ > + origin == AV_PIX_FMT_ARGB || \ > + origin == AV_PIX_FMT_ABGR) \ > + ? AV_RN32A(&src[(i) * 4]) \ > + : (isBE(origin) ? 
AV_RB16(&src[(i) * 2]) \ > + : AV_RL16(&src[(i) * 2]))) > + > +// This is a SIMD version for IBM POWER8 of function rgb16_32ToY_c_template > +// in file libswscale/input.c > +static av_always_inline void rgb16_32ToY_c_template_vsx(int16_t *dst, > + const uint8_t *src, > + int width, > + enum AVPixelFormat > origin, > + int shr, int shg, > + int shb, int shp, > + int maskr, int maskg, > + int maskb, int rsh, > + int gsh, int bsh, > int S, > + int32_t *rgb2yuv) > +{ > + const int ry = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh, by > = rgb2yuv[BY_IDX]<<bsh; > + const unsigned rnd = (32<<((S)-1)) + (1<<(S-7)); > + int i, j; > + > + int num_vec, frag; > + > + num_vec = width / 8; > + frag = width % 8; > + > + vector int v_ry = vec_splats((int)ry); > + vector int v_gy = vec_splats((int)gy); > + vector int v_by = vec_splats((int)by); > + > + vector int v_rnd = vec_splats((int)rnd); > + vector int v_opr = vec_splats((int)((S)-6)); > + > + vector int v_r, v_b, v_g, v_tmp; > + vector short v_tmpi, v_dst; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0; j--) { > + int px = input_pixel(i * 8 + j) >> shp; > + int b = (px & maskb) >> shb; > + int g = (px & maskg) >> shg; > + int r = (px & maskr) >> shr; > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if (!(j % 4)) { > + v_tmp = v_ry * v_r; > + v_tmp = v_tmp + v_gy * v_g; > + v_tmp = v_tmp + v_by * v_b; > + v_tmp = v_tmp + v_rnd; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr); > + > + v_tmpi = (vector short)v_tmp; > + v_dst[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dst[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dst[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dst[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dst, 0, (short *)&dst[i * 8]); > + } > + > + if ( frag ) { > + for (i = 0; i < frag; i++) { > + int px = input_pixel(num_vec * 8 + i) >> shp; > + int b = (px & maskb) >> shb; > + int g = (px & maskg) >> shg; > + int r = (px & maskr) >> shr; > + > + dst[num_vec * 8 + i] = (ry * r + gy 
* g + by * b + rnd) >> > ((S)-6); > + } > + } > +} > + > +// This is a SIMD version for IBM POWER8 of function rgb16_32ToUV_c_template > +// in file libswscale/input.c > +static av_always_inline void rgb16_32ToUV_c_template_vsx(int16_t *dstU, > + int16_t *dstV, > + const uint8_t *src, > + int width, > + enum AVPixelFormat > origin, > + int shr, int shg, > + int shb, int shp, > + int maskr, int > maskg, > + int maskb, int rsh, > + int gsh, int bsh, > int S, > + int32_t *rgb2yuv) > +{ > + const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << > gsh, bu = rgb2yuv[BU_IDX] << bsh, > + rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << > gsh, bv = rgb2yuv[BV_IDX] << bsh; > + const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7)); > + int i, j; > + > + int num_vec, frag; > + > + num_vec = width / 8; > + frag = width % 8; > + > + vector int v_ru = vec_splats((int)ru); > + vector int v_gu = vec_splats((int)gu); > + vector int v_bu = vec_splats((int)bu); > + > + vector int v_rv = vec_splats((int)rv); > + vector int v_gv = vec_splats((int)gv); > + vector int v_bv = vec_splats((int)bv); > + > + vector int v_rnd = vec_splats((int)rnd); > + vector int v_opr = vec_splats((int)((S)-6)); > + > + vector int v_r, v_b, v_g; > + vector int v_tmp; > + vector short v_tmpi, v_dstu, v_dstv; > + > + for (i = 0; i < num_vec; i++) { > + for(j = 7; j >= 0; j--) { > + int px = input_pixel(i * 8 + j) >> shp; > + int b = (px & maskb) >> shb; > + int g = (px & maskg) >> shg; > + int r = (px & maskr) >> shr; > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if (!(j % 4)) { > + v_tmp = v_ru * v_r; > + v_tmp = v_tmp + v_gu * v_g; > + v_tmp = v_tmp + v_bu * v_b; > + v_tmp = v_tmp + v_rnd; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr); > + > + v_tmpi = (vector short)v_tmp; > + v_dstu[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstu[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstu[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstu[(j / 4) * 4 + 0] = v_tmpi[0]; > + > + v_tmp = v_rv * v_r; > + 
v_tmp = v_tmp + v_gv * v_g; > + v_tmp = v_tmp + v_bv * v_b; > + v_tmp = v_tmp + v_rnd; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr); > + > + v_tmpi = (vector short)v_tmp; > + v_dstv[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstv[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstv[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstv[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]); > + vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]); > + } > + > + if ( frag ) { > + for (i = 0; i < frag; i++) { > + int px = input_pixel(num_vec * 8 + i) >> shp; > + int b = (px & maskb) >> shb; > + int g = (px & maskg) >> shg; > + int r = (px & maskr) >> shr; > + > + dstU[num_vec * 8 + i] = (ru * r + gu * g + bu * b + rnd) >> > ((S)-6); > + dstV[num_vec * 8 + i] = (rv * r + gv * g + bv * b + rnd) >> > ((S)-6); > + } > + } > +} > + > +// This is a SIMD version for IBM POWER8 of function > rgb16_32ToUV_half_c_template > +// in file libswscale/input.c > +static av_always_inline void rgb16_32ToUV_half_c_template_vsx(int16_t *dstU, > + int16_t *dstV, > + const uint8_t > *src, > + int width, > + enum > AVPixelFormat origin, > + int shr, int > shg, > + int shb, int > shp, > + int maskr, int > maskg, > + int maskb, int > rsh, > + int gsh, int > bsh, int S, > + int32_t > *rgb2yuv) > +{ > + const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << > gsh, bu = rgb2yuv[BU_IDX] << bsh, > + rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << > gsh, bv = rgb2yuv[BV_IDX] << bsh, > + maskgx = ~(maskr | maskb); > + const unsigned rnd = (256U<<(S)) + (1<<(S-6)); > + int i, j; > + > + int num_vec, frag; > + > + num_vec = width / 8; > + frag = width % 8; > + > + vector int v_ru = vec_splats((int)ru); > + vector int v_gu = vec_splats((int)gu); > + vector int v_bu = vec_splats((int)bu); > + > + vector int v_rv = vec_splats((int)rv); > + vector int v_gv = vec_splats((int)gv); > + vector int v_bv = vec_splats((int)bv); > + > + vector int v_rnd = vec_splats((int)rnd); > + vector int 
v_opr = vec_splats((int)((S)-6+1)); > + > + vector int v_r, v_b, v_g; > + vector int v_tmp; > + vector short v_tmpi, v_dstu, v_dstv; > + > + maskr |= maskr << 1; > + maskb |= maskb << 1; > + maskg |= maskg << 1; > + > + for (i = 0; i < num_vec; i++) { > + for (j = 7; j >= 0 ; j--) { > + unsigned px0 = input_pixel(2 * (i * 8 + j) + 0) >> shp; > + unsigned px1 = input_pixel(2 * (i * 8 + j) + 1) >> shp; > + int b, r, g = (px0 & maskgx) + (px1 & maskgx); > + int rb = px0 + px1 - g; > + > + b = (rb & maskb) >> shb; > + if (shp || > + origin == AV_PIX_FMT_BGR565LE || origin == > AV_PIX_FMT_BGR565BE || > + origin == AV_PIX_FMT_RGB565LE || origin == > AV_PIX_FMT_RGB565BE) { > + g >>= shg; > + } else { > + g = (g & maskg) >> shg; > + } > + r = (rb & maskr) >> shr; > + > + v_r[j % 4] = r; > + v_g[j % 4] = g; > + v_b[j % 4] = b; > + > + if (!(j % 4)) { > + v_tmp = v_ru * v_r; > + v_tmp = v_tmp + v_gu * v_g; > + v_tmp = v_tmp + v_bu * v_b; > + v_tmp = v_tmp + v_rnd; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr); > + > + v_tmpi = (vector short)v_tmp; > + v_dstu[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstu[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstu[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstu[(j / 4) * 4 + 0] = v_tmpi[0]; > + > + v_tmp = v_rv * v_r; > + v_tmp = v_tmp + v_gv * v_g; > + v_tmp = v_tmp + v_bv * v_b; > + v_tmp = v_tmp + v_rnd; > + v_tmp = vec_sr(v_tmp, (vector unsigned int)v_opr); > + > + v_tmpi = (vector short)v_tmp; > + v_dstv[(j / 4) * 4 + 3] = v_tmpi[6]; > + v_dstv[(j / 4) * 4 + 2] = v_tmpi[4]; > + v_dstv[(j / 4) * 4 + 1] = v_tmpi[2]; > + v_dstv[(j / 4) * 4 + 0] = v_tmpi[0]; > + } > + } > + vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]); > + vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]); > + } > + > + if ( frag ) { > + for (i = 0; i < frag; i++) { > + unsigned px0 = input_pixel(2 * (num_vec * 8 + i) + 0) >> shp; > + unsigned px1 = input_pixel(2 * (num_vec * 8 + i) + 1) >> shp; > + int b, r, g = (px0 & maskgx) + (px1 & maskgx); > + int rb = px0 + px1 - g; > + > + b = 
(rb & maskb) >> shb; > + if (shp || > + origin == AV_PIX_FMT_BGR565LE || origin == > AV_PIX_FMT_BGR565BE || > + origin == AV_PIX_FMT_RGB565LE || origin == > AV_PIX_FMT_RGB565BE) { > + g >>= shg; > + } else { > + g = (g & maskg) >> shg; > + } > + r = (rb & maskr) >> shr; > + > + dstU[num_vec * 8 + i] = (ru * r + gu * g + bu * b + > (unsigned)rnd) >> ((S)-6+1); > + dstV[num_vec * 8 + i] = (rv * r + gv * g + bv * b + > (unsigned)rnd) >> ((S)-6+1); > + } > + } > +} > + > +#undef input_pixel > + > +#pragma pop_macro("r") > +#pragma pop_macro("b") > +#pragma pop_macro("input_pixel") > + > -- > 2.4.11 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel