2019-01-09 22:26 GMT+01:00, Carl Eugen Hoyos <ceffm...@gmail.com>: > 2019-01-08 10:11 GMT+01:00, Lauri Kasanen <c...@gmx.com>: >> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt >> yuv420p16be \ >> -s 1920x1728 -f null -vframes 100 -v error -nostats - >> >> 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. >> Fate passes, each format tested with an image to video conversion. >> >> Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out >> of the 16-bit function. This includes the vec_mulo/mule functions too, >> not just vmuluwm. >> >> yuv420p9le >> 12341 UNITS in planarX, 130976 runs, 96 skips >> 73752 UNITS in planarX, 131066 runs, 6 skips >> yuv420p9be >> 12364 UNITS in planarX, 131025 runs, 47 skips >> 73001 UNITS in planarX, 131055 runs, 17 skips >> yuv420p10le >> 12386 UNITS in planarX, 131042 runs, 30 skips >> 72735 UNITS in planarX, 131062 runs, 10 skips >> yuv420p10be >> 12337 UNITS in planarX, 131045 runs, 27 skips >> 72734 UNITS in planarX, 131057 runs, 15 skips >> yuv420p12le >> 12236 UNITS in planarX, 131058 runs, 14 skips >> 73029 UNITS in planarX, 131062 runs, 10 skips >> yuv420p12be >> 12218 UNITS in planarX, 130973 runs, 99 skips >> 72402 UNITS in planarX, 131069 runs, 3 skips >> yuv420p14le >> 12168 UNITS in planarX, 131067 runs, 5 skips >> 72480 UNITS in planarX, 131069 runs, 3 skips >> yuv420p14be >> 12358 UNITS in planarX, 130948 runs, 124 skips >> 73772 UNITS in planarX, 131063 runs, 9 skips >> yuv420p16le >> 10439 UNITS in planarX, 130911 runs, 161 skips >> 157923 UNITS in planarX, 131068 runs, 4 skips >> yuv420p16be >> 10463 UNITS in planarX, 130874 runs, 198 skips >> 154405 UNITS in planarX, 131061 runs, 11 skips >> >> Signed-off-by: Lauri Kasanen <c...@gmx.com> >> --- >> >> v2: Separate macros so that yuv2plane1_16_vsx remains available for >> power7 >> v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + >> runtime >> check >> >> As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at >> least >> power8, meaning with the current setup such a binary wouldn't run on >> POWER7. >> However using the configure define lets it be disabled in configure like >> Michael >> pointed out, and having the runtime check doesn't hurt any (it allows for >> future >> splits like on x86, where one binary can run on low cpu but use higher >> ISA >> if >> available). >> >> libswscale/ppc/swscale_ppc_template.c | 4 +- >> libswscale/ppc/swscale_vsx.c | 195 >> +++++++++++++++++++++++++++++++++- >> 2 files changed, 193 insertions(+), 6 deletions(-) >> >> diff --git a/libswscale/ppc/swscale_ppc_template.c >> b/libswscale/ppc/swscale_ppc_template.c >> index 00e4b99..11decab 100644 >> --- a/libswscale/ppc/swscale_ppc_template.c >> +++ b/libswscale/ppc/swscale_ppc_template.c >> @@ -21,7 +21,7 @@ >> * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 >> USA >> */ >> >> -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, >> +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, >> const int16_t **src, uint8_t *dest, >> const uint8_t *dither, int offset, int >> x) >> { >> @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int >> filterSize, >> yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, >> 0); >> >> for (i = dst_u; i < dstW - 15; i += 16) >> - FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, >> + FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, >> offset, i); >> >> yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, >> i); >> diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c >> index 70da6ae..77680f8 100644 >> --- a/libswscale/ppc/swscale_vsx.c >> +++ b/libswscale/ppc/swscale_vsx.c >> @@ -83,6 +83,8 @@ >> #include "swscale_ppc_template.c" >> #undef FUNC >> >> +#undef vzero >> + >> #endif /* !HAVE_BIGENDIAN */ >> >> static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, >> @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, >> uint16_t *dest, int dstW, >> yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); >> } >> >> +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, >> + const int16_t **src, uint16_t *dest, int >> dstW, >> + int big_endian, int output_bits, int >> start) >> +{ >> + int i; >> + int shift = 11 + 16 - output_bits; >> + >> + for (i = start; i < dstW; i++) { >> + int val = 1 << (shift - 1); >> + int j; >> + >> + for (j = 0; j < filterSize; j++) >> + val += src[j][i] * filter[j]; >> + >> + output_pixel(&dest[i], val); >> + } >> +} >> + >> +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, >> + const int16_t **src, uint16_t *dest, int >> dstW, >> + int big_endian, int output_bits) >> +{ >> + const int dst_u = -(uintptr_t)dest & 7; >> + const int shift = 11 + 16 - output_bits; >> + const int add = (1 << (shift - 1)); >> + const int clip = (1 << output_bits) - 1; >> + const uint16_t swap = big_endian ? 8 : 0; >> + const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; >> + const vector uint32_t vshift = (vector uint32_t) {shift, shift, >> shift, >> shift}; >> + const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, >> swap, swap, swap, swap, swap}; >> + const vector uint16_t vlargest = (vector uint16_t) {clip, clip, >> clip, >> clip, clip, clip, clip, clip}; >> + const vector int16_t vzero = vec_splat_s16(0); >> + const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, >> 11, 4, 5, 12, 13, 6, 7, 14, 15}; >> + vector int16_t vfilter[MAX_FILTER_SIZE], vin; >> + vector uint16_t v; >> + vector uint32_t vleft, vright, vtmp; >> + int i, j; >> + >> + for (i = 0; i < filterSize; i++) { >> + vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], >> filter[i], >> + filter[i], filter[i], filter[i], >> filter[i]}; >> + } >> + >> + yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, >> output_bits, 0); >> + >> + for (i = dst_u; i < dstW - 7; i += 8) { >> + vleft = vright = vadd; >> + >> + for (j = 0; j < filterSize; j++) { >> + vin = vec_vsx_ld(0, &src[j][i]); >> + vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]); >> + vleft = vec_add(vleft, vtmp); >> + vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]); >> + vright = vec_add(vright, vtmp); >> + } >> + >> + vleft = vec_sra(vleft, vshift); >> + vright = vec_sra(vright, vshift); >> + v = vec_packsu(vleft, vright); >> + v = (vector uint16_t) vec_max((vector int16_t) v, vzero); >> + v = vec_min(v, vlargest); >> + v = vec_rl(v, vswap); >> + v = vec_perm(v, v, vperm); >> + vec_st(v, 0, &dest[i]); >> + } >> + >> + yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, >> output_bits, i); >> +} >> + >> + >> #undef output_pixel >> >> #define output_pixel(pos, val, bias, signedness) \ >> @@ -234,7 +306,97 @@ static void yuv2plane1_16_vsx(const int32_t *src, >> uint16_t *dest, int dstW, >> yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i); >> } >> >> +#ifdef HAVE_POWER8 >> + >> +static void yuv2planeX_16_u(const int16_t *filter, int filterSize, >> + const int32_t **src, uint16_t *dest, int >> dstW, >> + int big_endian, int output_bits, int start) >> +{ >> + int i; >> + int shift = 15; >> + >> + for (i = start; i < dstW; i++) { >> + int val = 1 << (shift - 1); >> + int j; >> + >> + /* range of val is [0,0x7FFFFFFF], so 31 bits, but with >> lanczos/spline >> + * filters (or anything with negative coeffs, the range can be >> slightly >> + * wider in both directions. To account for this overflow, we >> subtract >> + * a constant so it always fits in the signed range (assuming a >> + * reasonable filterSize), and re-add that at the end. */ >> + val -= 0x40000000; >> + for (j = 0; j < filterSize; j++) >> + val += src[j][i] * (unsigned)filter[j]; >> + >> + output_pixel(&dest[i], val, 0x8000, int); >> + } >> +} >> + >> +static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize, >> + const int32_t **src, uint16_t *dest, int >> dstW, >> + int big_endian, int output_bits) >> +{ >> + const int dst_u = -(uintptr_t)dest & 7; >> + const int shift = 15; >> + const int bias = 0x8000; >> + const int add = (1 << (shift - 1)) - 0x40000000; >> + const uint16_t swap = big_endian ? 8 : 0; >> + const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; >> + const vector uint32_t vshift = (vector uint32_t) {shift, shift, >> shift, >> shift}; >> + const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, >> swap, swap, swap, swap, swap}; >> + const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias, >> bias, bias, bias, bias, bias}; >> + vector int32_t vfilter[MAX_FILTER_SIZE]; >> + vector uint16_t v; >> + vector uint32_t vleft, vright, vtmp; >> + vector int32_t vin32l, vin32r; >> + int i, j; >> + >> + for (i = 0; i < filterSize; i++) { >> + vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i], >> filter[i]}; >> + } >> + >> + yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, >> output_bits, 0); >> + >> + for (i = dst_u; i < dstW - 7; i += 8) { >> + vleft = vright = vadd; >> + >> + for (j = 0; j < filterSize; j++) { >> + vin32l = vec_vsx_ld(0, &src[j][i]); >> + vin32r = vec_vsx_ld(0, &src[j][i + 4]); >> + > >> +#ifdef __GNUC__ >> + // GCC does not support vmuluwm yet. Bug open. >> + __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l), >> "v"(vfilter[j])); >> + vleft = vec_add(vleft, vtmp); >> + __asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r), >> "v"(vfilter[j])); >> + vright = vec_add(vright, vtmp); >> +#else >> + // No idea which compilers this works in, untested. Copied >> from >> libsimdpp >> + vtmp = vec_vmuluwm(vin32l, vfilter[j]); >> + vleft = vec_add(vleft, vtmp); >> + vtmp = vec_vmuluwm(vin32r, vfilter[j]); >> + vright = vec_add(vright, vtmp); >> +#endif > > Is there no xlc installed on your test system? > I suspect an earlier patch from you already > broke xlc compilation...
46c5693ea3a9364e24e2f5336bcdb5b191a2329f is the first bad commit Testing this version: $ make libswscale/ppc/swscale_altivec.o CC libswscale/ppc/swscale_altivec.o warning: 1540-5200 The option "-fomit-frame-pointer" is not supported. warning: 1540-5200 The option "-mabi=altivec" is not supported. warning: 1540-5200 The option "-mvsx" is not supported. warning: 1540-5200 The option "-fno-math-errno" is not supported. warning: 1540-5200 The option "-fno-signed-zeros" is not supported. In file included from src/libswscale/ppc/swscale_altivec.c:28: src/libswscale/swscale_internal.h:641:1: warning: unknown attribute 'cold' ignored [-Wunknown-attributes] av_cold void ff_sws_init_range_convert(SwsContext *c); ^ src/libavutil/attributes.h:82:36: note: expanded from macro 'av_cold' # define av_cold __attribute__((cold)) ^ src/libswscale/ppc/swscale_altivec.c:104:30: warning: unused variable 'perm' [-Wunused-variable] vector unsigned char perm; ^ src/libswscale/ppc/swscale_altivec.c:245:46: warning: unused variable 'src_v0' [-Wunused-variable] vector unsigned char src_vF, src_v0, src_v1; ^ src/libswscale/ppc/swscale_altivec.c:245:54: warning: unused variable 'src_v1' [-Wunused-variable] vector unsigned char src_vF, src_v0, src_v1; ^ src/libswscale/ppc/swscale_altivec.c:246:38: warning: unused variable 'permS' [-Wunused-variable] vector unsigned char permS; ^ src/libswscale/ppc/swscale_altivec.c:295:42: warning: unused variable 'src_v1' [-Wunused-variable] vector unsigned char src_v1, src_vF; ^ src/libswscale/ppc/swscale_altivec.c:296:41: warning: unused variable 'filter_v1R' [-Wunused-variable] vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1; ^ src/libswscale/ppc/swscale_altivec.c:296:53: warning: unused variable 'filter_v2R' [-Wunused-variable] vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1; ^ src/libswscale/ppc/swscale_altivec.c:312:42: warning: unused variable 'src_v1' [-Wunused-variable] vector unsigned char src_v1, src_vF; ^ src/libswscale/ppc/swscale_altivec.c:313:48: warning: unused variable 'filter_v1R' [-Wunused-variable] vector signed short src_v, filter_v1R, filter_v; ^ src/libswscale/ppc/swscale_altivec.c:285:33: warning: unused variable 'offset' [-Wunused-variable] register int j, offset = i * 2 * filterSize; ^ src/libswscale/ppc/swscale_altivec.c:289:37: warning: unused variable 'filter_v0R' [-Wunused-variable] vector signed short filter_v0R; ^ src/libswscale/ppc/swscale_altivec.c:290:38: warning: unused variable 'permF' [-Wunused-variable] vector unsigned char permF, src_v0, permS; ^ src/libswscale/ppc/swscale_altivec.c:290:45: warning: unused variable 'src_v0' [-Wunused-variable] vector unsigned char permF, src_v0, permS; ^ src/libswscale/ppc/swscale_altivec.c:290:53: warning: unused variable 'permS' [-Wunused-variable] vector unsigned char permF, src_v0, permS; ^ src/libswscale/ppc/swscale_altivec.c:344:11: error: unknown type name 'vector' const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7}; Carl Eugen _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel