2019-01-08 10:11 GMT+01:00, Lauri Kasanen <c...@gmx.com>: > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > yuv420p16be \ > -s 1920x1728 -f null -vframes 100 -v error -nostats - > > 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. > Fate passes, each format tested with an image to video conversion. > > Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out > of the 16-bit function. This includes the vec_mulo/mule functions too, > not just vmuluwm. > > yuv420p9le > 12341 UNITS in planarX, 130976 runs, 96 skips > 73752 UNITS in planarX, 131066 runs, 6 skips > yuv420p9be > 12364 UNITS in planarX, 131025 runs, 47 skips > 73001 UNITS in planarX, 131055 runs, 17 skips > yuv420p10le > 12386 UNITS in planarX, 131042 runs, 30 skips > 72735 UNITS in planarX, 131062 runs, 10 skips > yuv420p10be > 12337 UNITS in planarX, 131045 runs, 27 skips > 72734 UNITS in planarX, 131057 runs, 15 skips > yuv420p12le > 12236 UNITS in planarX, 131058 runs, 14 skips > 73029 UNITS in planarX, 131062 runs, 10 skips > yuv420p12be > 12218 UNITS in planarX, 130973 runs, 99 skips > 72402 UNITS in planarX, 131069 runs, 3 skips > yuv420p14le > 12168 UNITS in planarX, 131067 runs, 5 skips > 72480 UNITS in planarX, 131069 runs, 3 skips > yuv420p14be > 12358 UNITS in planarX, 130948 runs, 124 skips > 73772 UNITS in planarX, 131063 runs, 9 skips > yuv420p16le > 10439 UNITS in planarX, 130911 runs, 161 skips > 157923 UNITS in planarX, 131068 runs, 4 skips > yuv420p16be > 10463 UNITS in planarX, 130874 runs, 198 skips > 154405 UNITS in planarX, 131061 runs, 11 skips > > Signed-off-by: Lauri Kasanen <c...@gmx.com> > --- > > v2: Separate macros so that yuv2plane1_16_vsx remains available for power7 > v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime > check > > As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at > least > power8, meaning with the current setup such a binary wouldn't run on POWER7. 
> However using the configure define lets it be disabled in configure like > Michael > pointed out, and having the runtime check doesn't hurt any (it allows for > future > splits like on x86, where one binary can run on low cpu but use higher ISA > if > available). > > libswscale/ppc/swscale_ppc_template.c | 4 +- > libswscale/ppc/swscale_vsx.c | 195 > +++++++++++++++++++++++++++++++++- > 2 files changed, 193 insertions(+), 6 deletions(-) > > diff --git a/libswscale/ppc/swscale_ppc_template.c > b/libswscale/ppc/swscale_ppc_template.c > index 00e4b99..11decab 100644 > --- a/libswscale/ppc/swscale_ppc_template.c > +++ b/libswscale/ppc/swscale_ppc_template.c > @@ -21,7 +21,7 @@ > * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > */ > > -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, > +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, > const int16_t **src, uint8_t *dest, > const uint8_t *dither, int offset, int x) > { > @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int > filterSize, > yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); > > for (i = dst_u; i < dstW - 15; i += 16) > - FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, > + FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, > offset, i); > > yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); > diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c > index 70da6ae..77680f8 100644 > --- a/libswscale/ppc/swscale_vsx.c > +++ b/libswscale/ppc/swscale_vsx.c > @@ -83,6 +83,8 @@ > #include "swscale_ppc_template.c" > #undef FUNC > > +#undef vzero > + > #endif /* !HAVE_BIGENDIAN */ > > static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, > @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, > uint16_t *dest, int dstW, > yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); > } > > 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, > + const int16_t **src, uint16_t *dest, int > dstW, > + int big_endian, int output_bits, int start) > +{ > + int i; > + int shift = 11 + 16 - output_bits; > + > + for (i = start; i < dstW; i++) { > + int val = 1 << (shift - 1); > + int j; > + > + for (j = 0; j < filterSize; j++) > + val += src[j][i] * filter[j]; > + > + output_pixel(&dest[i], val); > + } > +} > + > +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, > + const int16_t **src, uint16_t *dest, int > dstW, > + int big_endian, int output_bits) > +{ > + const int dst_u = -(uintptr_t)dest & 7; > + const int shift = 11 + 16 - output_bits; > + const int add = (1 << (shift - 1)); > + const int clip = (1 << output_bits) - 1; > + const uint16_t swap = big_endian ? 8 : 0; > + const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; > + const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, > shift}; > + const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, > swap, swap, swap, swap, swap}; > + const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, > clip, clip, clip, clip, clip}; > + const vector int16_t vzero = vec_splat_s16(0); > + const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, > 11, 4, 5, 12, 13, 6, 7, 14, 15}; > + vector int16_t vfilter[MAX_FILTER_SIZE], vin; > + vector uint16_t v; > + vector uint32_t vleft, vright, vtmp; > + int i, j; > + > + for (i = 0; i < filterSize; i++) { > + vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], > filter[i], > + filter[i], filter[i], filter[i], > filter[i]}; > + } > + > + yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, > output_bits, 0); > + > + for (i = dst_u; i < dstW - 7; i += 8) { > + vleft = vright = vadd; > + > + for (j = 0; j < filterSize; j++) { > + vin = vec_vsx_ld(0, &src[j][i]); > + vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]); > + vleft = vec_add(vleft, 
vtmp); > + vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]); > + vright = vec_add(vright, vtmp); > + } > + > + vleft = vec_sra(vleft, vshift); > + vright = vec_sra(vright, vshift); > + v = vec_packsu(vleft, vright); > + v = (vector uint16_t) vec_max((vector int16_t) v, vzero); > + v = vec_min(v, vlargest); > + v = vec_rl(v, vswap); > + v = vec_perm(v, v, vperm); > + vec_st(v, 0, &dest[i]); > + } > + > + yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, > output_bits, i); > +} > + > + > #undef output_pixel > > #define output_pixel(pos, val, bias, signedness) \ > @@ -234,7 +306,97 @@ static void yuv2plane1_16_vsx(const int32_t *src, > uint16_t *dest, int dstW, > yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i); > } > > +#ifdef HAVE_POWER8 > + > +static void yuv2planeX_16_u(const int16_t *filter, int filterSize, > + const int32_t **src, uint16_t *dest, int dstW, > + int big_endian, int output_bits, int start) > +{ > + int i; > + int shift = 15; > + > + for (i = start; i < dstW; i++) { > + int val = 1 << (shift - 1); > + int j; > + > + /* range of val is [0,0x7FFFFFFF], so 31 bits, but with > lanczos/spline > + * filters (or anything with negative coeffs), the range can be > slightly > + * wider in both directions. To account for this overflow, we > subtract > + * a constant so it always fits in the signed range (assuming a > + * reasonable filterSize), and re-add that at the end. */ > + val -= 0x40000000; > + for (j = 0; j < filterSize; j++) > + val += src[j][i] * (unsigned)filter[j]; > + > + output_pixel(&dest[i], val, 0x8000, int); > + } > +} > + > +static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize, > + const int32_t **src, uint16_t *dest, int > dstW, > + int big_endian, int output_bits) > +{ > + const int dst_u = -(uintptr_t)dest & 7; > + const int shift = 15; > + const int bias = 0x8000; > + const int add = (1 << (shift - 1)) - 0x40000000; > + const uint16_t swap = big_endian ? 
8 : 0; > + const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; > + const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, > shift}; > + const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, > swap, swap, swap, swap, swap}; > + const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias, > bias, bias, bias, bias, bias}; > + vector int32_t vfilter[MAX_FILTER_SIZE]; > + vector uint16_t v; > + vector uint32_t vleft, vright, vtmp; > + vector int32_t vin32l, vin32r; > + int i, j; > + > + for (i = 0; i < filterSize; i++) { > + vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i], > filter[i]}; > + } > + > + yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, > output_bits, 0); > + > + for (i = dst_u; i < dstW - 7; i += 8) { > + vleft = vright = vadd; > + > + for (j = 0; j < filterSize; j++) { > + vin32l = vec_vsx_ld(0, &src[j][i]); > + vin32r = vec_vsx_ld(0, &src[j][i + 4]); > +
> +#ifdef __GNUC__ > + // GCC does not support vmuluwm yet. Bug open. > + __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l), > "v"(vfilter[j])); > + vleft = vec_add(vleft, vtmp); > + __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r), > "v"(vfilter[j])); > + vright = vec_add(vright, vtmp); > +#else > + // No idea which compilers this works in, untested. Copied from > libsimdpp > + vtmp = vec_vmuluwm(vin32l, vfilter[j]); > + vleft = vec_add(vleft, vtmp); > + vtmp = vec_vmuluwm(vin32r, vfilter[j]); > + vright = vec_add(vright, vtmp); > +#endif Is there no xlc installed on your test system? I suspect an earlier patch from you already broke xlc compilation... Carl Eugen _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel