[FFmpeg-devel] [PATCH v2] swscale: Remove duplicated code
In this function, the exact same clamping happens both in the if and unconditionally. Signed-off-by: Lauri Kasanen --- libswscale/output.c | 10 -- 1 file changed, 10 deletions(-) v2: Remove the unconditional instead of the if'd clipping. I'll leave changing the bit pattern to others, there's so many funcs using 0x100. diff --git a/libswscale/output.c b/libswscale/output.c index d7c53e6..d3401f0 100644 --- a/libswscale/output.c +++ b/libswscale/output.c @@ -853,11 +853,6 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0, V = av_clip_uint8(V); } -Y1 = av_clip_uint8(Y1); -Y2 = av_clip_uint8(Y2); -U = av_clip_uint8(U); -V = av_clip_uint8(V); - output_pixels(i * 4, Y1, U, Y2, V); } } else { @@ -875,11 +870,6 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0, V = av_clip_uint8(V); } -Y1 = av_clip_uint8(Y1); -Y2 = av_clip_uint8(Y2); -U = av_clip_uint8(U); -V = av_clip_uint8(V); - output_pixels(i * 4, Y1, U, Y2, V); } } -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2] swscale: Remove duplicated code
On Tue, 26 Mar 2019 22:00:54 +0100 Michael Niedermayer wrote: > On Tue, Mar 26, 2019 at 08:58:34AM +0200, Lauri Kasanen wrote: > > In this function, the exact same clamping happens both in the if and > > unconditionally. > > > > Signed-off-by: Lauri Kasanen > > --- > > libswscale/output.c | 10 -- > > 1 file changed, 10 deletions(-) > > > > v2: Remove the unconditional instead of the if'd clipping. > > I'll leave changing the bit pattern to others, there's so many funcs using > > 0x100. > > > > diff --git a/libswscale/output.c b/libswscale/output.c > > index d7c53e6..d3401f0 100644 > > --- a/libswscale/output.c > > +++ b/libswscale/output.c > > should be ok > > thanks Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 resend] swscale/ppc: VSX-optimize yuv2rgb_full
On Thu, 21 Mar 2019 09:54:17 +0200 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \ > -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \ > -cpuflags 0 -v error - > > This uses 32-bit mul, so POWER8 only. > > The following output formats get about 4.5x speedup: > > rgb24 > 39980 UNITS in yuv2packed1, 32768 runs, 0 skips >8774 UNITS in yuv2packed1, 32768 runs, 0 skips > bgr24 > 40069 UNITS in yuv2packed1, 32768 runs, 0 skips >8772 UNITS in yuv2packed1, 32766 runs, 2 skips > rgba > 39759 UNITS in yuv2packed1, 32768 runs, 0 skips >8681 UNITS in yuv2packed1, 32767 runs, 1 skips > bgra > 39729 UNITS in yuv2packed1, 32768 runs, 0 skips >8696 UNITS in yuv2packed1, 32766 runs, 2 skips > argb > 39766 UNITS in yuv2packed1, 32768 runs, 0 skips >8672 UNITS in yuv2packed1, 32766 runs, 2 skips > bgra > 39784 UNITS in yuv2packed1, 32768 runs, 0 skips >8659 UNITS in yuv2packed1, 32767 runs, 1 skips > > Signed-off-by: Lauri Kasanen > --- > libswscale/ppc/swscale_vsx.c | 291 > +++ > 1 file changed, 291 insertions(+) > > v2: HAVE_POWER8 from ifdef to if > Resending due to mail client troubles Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] This patch addresses Trac ticket #5570. The optimized functions are in file libswscale/ppc/input_vsx.c. Each optimized function name is a concatenation of the corresponding
On Fri, 29 Mar 2019 17:00:38 +0300 Вячеслав wrote: > --- > libswscale/ppc/Makefile |3 +- > libswscale/ppc/input_vsx.c| 3801 > + > libswscale/swscale.c |3 + > libswscale/swscale_internal.h |1 + > 4 files changed, 3807 insertions(+), 1 deletion(-) > create mode 100644 libswscale/ppc/input_vsx.c Please include performance benchmarks for each function. The description should go in the patch main part, not in the title. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/3] swscale/ppc: VSX-optimize yuv2422_1
On Sun, 24 Mar 2019 15:10:35 +0200 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \ > -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \ > -cpuflags 0 -v error - > > 15.3x speedup: > > yuyv422 > 14513 UNITS in yuv2packed1, 32768 runs, 0 skips > 949 UNITS in yuv2packed1, 32767 runs, 1 skips > yvyu422 > 14516 UNITS in yuv2packed1, 32767 runs, 1 skips > 943 UNITS in yuv2packed1, 32767 runs, 1 skips > uyvy422 > 14530 UNITS in yuv2packed1, 32767 runs, 1 skips > 941 UNITS in yuv2packed1, 32766 runs, 2 skips > > Signed-off-by: Lauri Kasanen > --- > libswscale/ppc/swscale_vsx.c | 149 > +++ > 1 file changed, 149 insertions(+) Applying these. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_1
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags fast_bilinear \ -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \ -cpuflags 0 -v error - 32-bit mul, power8 only. 1.8-2.3x speedup: rgb24 18192 UNITS in yuv2packed1, 32767 runs, 1 skips 9983 UNITS in yuv2packed1, 32760 runs, 8 skips bgr24 18665 UNITS in yuv2packed1, 32766 runs, 2 skips 9925 UNITS in yuv2packed1, 32763 runs, 5 skips rgba 20239 UNITS in yuv2packed1, 32767 runs, 1 skips 8794 UNITS in yuv2packed1, 32759 runs, 9 skips bgra 20354 UNITS in yuv2packed1, 32768 runs, 0 skips 8770 UNITS in yuv2packed1, 32761 runs, 7 skips argb 20185 UNITS in yuv2packed1, 32768 runs, 0 skips 8761 UNITS in yuv2packed1, 32761 runs, 7 skips bgra 20360 UNITS in yuv2packed1, 32766 runs, 2 skips 8759 UNITS in yuv2packed1, 32764 runs, 4 skips This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx version is also heavily inaccurate, while the vsx version has high accuracy. Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 425 +-- 1 file changed, 330 insertions(+), 95 deletions(-) Okay, so I'm a bit unsure what to do here. I'm sure it could be faster if made as inaccurate as the mmx version, but that differs a lot from the C version, which itself is inaccurate vs the _full C and vsx versions. There are no other versions than mmx and C to compare against. I took the approach of using the accurate _full YUV logic, just writing two pixels for each UV value. The C version uses a LUT, resulting in ~1/255 rounding errors in most pixels compared to the accurate C/VSX _full logic. The MMX version does low accuracy logic, differing from the C LUT as much as 10/255 per pixel. Speed or accuracy? IMHO the mmx errors are far too large. 
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 69ec63d..0ac8cac 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -422,6 +422,104 @@ yuv2NBPSX(16, BE, 1, 16, int32_t) yuv2NBPSX(16, LE, 0, 16, int32_t) #endif +#define WRITERGB \ +R_l = vec_max(R_l, zero32); \ +R_r = vec_max(R_r, zero32); \ +G_l = vec_max(G_l, zero32); \ +G_r = vec_max(G_r, zero32); \ +B_l = vec_max(B_l, zero32); \ +B_r = vec_max(B_r, zero32); \ +\ +R_l = vec_min(R_l, rgbclip); \ +R_r = vec_min(R_r, rgbclip); \ +G_l = vec_min(G_l, rgbclip); \ +G_r = vec_min(G_r, rgbclip); \ +B_l = vec_min(B_l, rgbclip); \ +B_r = vec_min(B_r, rgbclip); \ +\ +R_l = vec_sr(R_l, shift22); \ +R_r = vec_sr(R_r, shift22); \ +G_l = vec_sr(G_l, shift22); \ +G_r = vec_sr(G_r, shift22); \ +B_l = vec_sr(B_l, shift22); \ +B_r = vec_sr(B_r, shift22); \ +\ +rd16 = vec_packsu(R_l, R_r); \ +gd16 = vec_packsu(G_l, G_r); \ +bd16 = vec_packsu(B_l, B_r); \ +rd = vec_packsu(rd16, zero16); \ +gd = vec_packsu(gd16, zero16); \ +bd = vec_packsu(bd16, zero16); \ +\ +switch(target) { \ +case AV_PIX_FMT_RGB24: \ +out0 = vec_perm(rd, gd, perm3rg0); \ +out0 = vec_perm(out0, bd, perm3tb0); \ +out1 = vec_perm(rd, gd, perm3rg1); \ +out1 = vec_perm(out1, bd, perm3tb1); \ +\ +vec_vsx_st(out0, 0, dest); \ +vec_vsx_st(out1, 16, dest); \ +\ +dest += 24; \ +break; \ +case AV_PIX_FMT_BGR24: \ +out0 = vec_perm(bd, gd, perm3rg0); \ +out0 = vec_perm(out0, rd, perm3tb0); \ +out1 = vec_perm(bd, gd, perm3rg1); \ +out1 = vec_perm(out1, rd, perm3tb1); \ +\ +vec_vsx_st(out0, 0, dest); \ +vec_vsx_st(out1, 16, dest); \ +\ +dest += 24; \ +break; \ +case AV_PIX_FMT_BGRA: \ +out0 = vec_mergeh(bd, gd); \ +out1 = vec_mergeh(rd, ad); \ +\ +tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \ +vec_vsx_st(tmp8, 0, dest); \ +tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \ +vec_vsx_st(tmp8, 16, dest); \ +\ +dest += 32; \ +break; \ +case 
AV_PIX_FMT_RGBA: \ +out0 = vec_mergeh(rd, gd); \ +out1 = vec_mergeh(bd, ad); \ +\ +tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \ +vec_vsx_st(tmp8, 0, dest); \ +tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \ +vec_vsx_st(tmp8, 16, dest); \ +\ +dest += 32; \ +break; \ +case AV_PIX_FMT_ARGB: \ +out0 = vec_mergeh(ad, rd); \ +out1 = vec_mer
Re: [FFmpeg-devel] [PATCH] This patch addresses Trac ticket #5570. The optimized functions are in file libswscale/ppc/input_vsx.c. Each optimized function name is a concatenation of the corresponding
On Mon, 1 Apr 2019 09:07:48 +0300 slava wrote: > Sorry for title. It is my first experience in git send-email. Can I make > a benchmark with handwritten tests or have some standard tool in ffmpeg? > And will the benchmark on x86-64 be informative? We have standard bench macros, START_TIMER and STOP_TIMER. Put those around the function's callsite, then do some ffmpeg run that calls that specific function. Then add "-cpuflags 0" to the call to get the C results, and from the numbers you can calculate the speedup. Both the C and VSX runs should be done on the POWER machine. A Qemu VM, emulating POWER instructions on x86-64, would probably be useless for benchmark purposes. There are free POWER VMs available for testing from a few places. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_2
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags area \ -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \ -cpuflags 0 -v error - 32-bit mul, power8 only. ~4x speedup: rgb24 52763 UNITS in yuv2packed2, 16384 runs, 0 skips 13453 UNITS in yuv2packed2, 16384 runs, 0 skips bgr24 53144 UNITS in yuv2packed2, 16384 runs, 0 skips 13616 UNITS in yuv2packed2, 16384 runs, 0 skips rgba 52796 UNITS in yuv2packed2, 16384 runs, 0 skips 12904 UNITS in yuv2packed2, 16384 runs, 0 skips bgra 52732 UNITS in yuv2packed2, 16384 runs, 0 skips 13262 UNITS in yuv2packed2, 16384 runs, 0 skips argb 52661 UNITS in yuv2packed2, 16384 runs, 0 skips 12879 UNITS in yuv2packed2, 16384 runs, 0 skips bgra 52662 UNITS in yuv2packed2, 16384 runs, 0 skips 12932 UNITS in yuv2packed2, 16384 runs, 0 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 166 +++ 1 file changed, 166 insertions(+) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 0ac8cac..6ff8b62 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -520,6 +520,148 @@ yuv2NBPSX(16, LE, 0, 16, int32_t) break; \ } +#define SETUP(x, buf0, alpha1, buf1, alpha) { \ +x = vec_ld(0, buf0); \ +tmp = vec_mule(x, alpha1); \ +tmp2 = vec_mulo(x, alpha1); \ +tmp3 = vec_mergeh(tmp, tmp2); \ +tmp4 = vec_mergel(tmp, tmp2); \ +\ +x = vec_ld(0, buf1); \ +tmp = vec_mule(x, alpha); \ +tmp2 = vec_mulo(x, alpha); \ +tmp5 = vec_mergeh(tmp, tmp2); \ +tmp6 = vec_mergel(tmp, tmp2); \ +\ +tmp3 = vec_add(tmp3, tmp5); \ +tmp4 = vec_add(tmp4, tmp6); \ +} + + +static av_always_inline void +yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, int dstW, + int yalpha, int uvalpha, int y, + enum AVPixelFormat target, int hasAlpha) +{ +const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1], + *abuf0 = hasAlpha ? 
abuf[0] : NULL, + *abuf1 = hasAlpha ? abuf[1] : NULL; +const int16_t yalpha1 = 4096 - yalpha; +const int16_t uvalpha1 = 4096 - uvalpha; +vector int16_t vy, vu, vv, A = vec_splat_s16(0); +vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32; +vector int32_t R_l, R_r, G_l, G_r, B_l, B_r; +vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6; +vector uint16_t rd16, gd16, bd16; +vector uint8_t rd, bd, gd, ad, out0, out1, tmp8; +const vector int16_t vyalpha1 = vec_splats(yalpha1); +const vector int16_t vuvalpha1 = vec_splats(uvalpha1); +const vector int16_t vyalpha = vec_splats((int16_t) yalpha); +const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha); +const vector uint16_t zero16 = vec_splat_u16(0); +const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset); +const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff); +const vector int32_t y_add = vec_splats(1 << 21); +const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff); +const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff); +const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff); +const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff); +const vector int32_t rgbclip = vec_splats(1 << 30); +const vector int32_t zero32 = vec_splat_s32(0); +const vector uint32_t shift19 = vec_splats(19U); +const vector uint32_t shift22 = vec_splats(22U); +const vector uint32_t shift10 = vec_splat_u32(10); +const vector int32_t dec128 = vec_splats(128 << 19); +const vector int32_t add18 = vec_splats(1 << 18); +int i; + +// Various permutations +const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0, + 0x1, 0x11, 0, + 0x2, 0x12, 0, + 0x3, 0x13, 0, + 0x4, 0x14, 0, + 0x5 }; +const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0, + 0x6, 0x16, 0, + 0x7, 0x17, 0 }; +const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10, + 0x3, 0x4, 0x11, + 0x6, 0x7, 0x12, +
[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_X
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \ -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \ -cpuflags 0 -v error - 32-bit mul, power8 only. ~6.4x speedup: rgb24 214278 UNITS in yuv2packedX, 16384 runs, 0 skips 33249 UNITS in yuv2packedX, 16384 runs, 0 skips bgr24 214616 UNITS in yuv2packedX, 16384 runs, 0 skips 33233 UNITS in yuv2packedX, 16384 runs, 0 skips rgba 214517 UNITS in yuv2packedX, 16384 runs, 0 skips 33271 UNITS in yuv2packedX, 16384 runs, 0 skips bgra 214973 UNITS in yuv2packedX, 16384 runs, 0 skips 33397 UNITS in yuv2packedX, 16384 runs, 0 skips argb 214613 UNITS in yuv2packedX, 16384 runs, 0 skips 33310 UNITS in yuv2packedX, 16384 runs, 0 skips bgra 214637 UNITS in yuv2packedX, 16384 runs, 0 skips 33330 UNITS in yuv2packedX, 16384 runs, 0 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 160 +++ 1 file changed, 160 insertions(+) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 6ff8b62..e05f9ec 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -520,6 +520,139 @@ yuv2NBPSX(16, LE, 0, 16, int32_t) break; \ } +static av_always_inline void +yuv2rgb_full_X_vsx_template(SwsContext *c, const int16_t *lumFilter, + const int16_t **lumSrc, int lumFilterSize, + const int16_t *chrFilter, const int16_t **chrUSrc, + const int16_t **chrVSrc, int chrFilterSize, + const int16_t **alpSrc, uint8_t *dest, + int dstW, int y, enum AVPixelFormat target, int hasAlpha) +{ +vector int16_t vv; +vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32; +vector int32_t R_l, R_r, G_l, G_r, B_l, B_r; +vector int32_t tmp, tmp2, tmp3, tmp4; +vector uint16_t rd16, gd16, bd16; +vector uint8_t rd, bd, gd, ad, out0, out1, tmp8; +vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE]; +const vector int32_t ystart = vec_splats(1 << 9); +const vector int32_t uvstart = vec_splats((1 << 9) - (128 << 19)); +const vector uint16_t zero16 = vec_splat_u16(0); +const 
vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset); +const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff); +const vector int32_t y_add = vec_splats(1 << 21); +const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff); +const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff); +const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff); +const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff); +const vector int32_t rgbclip = vec_splats(1 << 30); +const vector int32_t zero32 = vec_splat_s32(0); +const vector uint32_t shift22 = vec_splats(22U); +const vector uint32_t shift10 = vec_splat_u32(10); +int i, j; + +// Various permutations +const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0, + 0x1, 0x11, 0, + 0x2, 0x12, 0, + 0x3, 0x13, 0, + 0x4, 0x14, 0, + 0x5 }; +const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0, + 0x6, 0x16, 0, + 0x7, 0x17, 0 }; +const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10, + 0x3, 0x4, 0x11, + 0x6, 0x7, 0x12, + 0x9, 0xa, 0x13, + 0xc, 0xd, 0x14, + 0xf }; +const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15, + 0x2, 0x3, 0x16, + 0x5, 0x6, 0x17 }; + +ad = vec_splats((uint8_t) 255); + +for (i = 0; i < lumFilterSize; i++) +vlumFilter[i] = vec_splats(lumFilter[i]); +for (i = 0; i < chrFilterSize; i++) +vchrFilter[i] = vec_splats(chrFilter[i]); + +for (i = 0; i < dstW; i += 8) { +vy32_l = +vy32_r = ystart; +vu32_l = +vu32_r = +vv32_l = +vv32_r = uvstart; + +for (j = 0; j < lumFilterSize; j++) { +vv = vec_ld(0, &lumSrc[j][i]); +tmp = vec_mule(vv, vlumFilter[j]); +tmp2 = vec_mulo(vv, vlumFilter[j
[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_2
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags fast_bilinear \ -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \ -cpuflags 0 -v error - 32-bit mul, power8 only. ~2x speedup: rgb24 24431 UNITS in yuv2packed2, 16384 runs, 0 skips 13783 UNITS in yuv2packed2, 16383 runs, 1 skips bgr24 24396 UNITS in yuv2packed2, 16384 runs, 0 skips 14059 UNITS in yuv2packed2, 16384 runs, 0 skips rgba 26815 UNITS in yuv2packed2, 16383 runs, 1 skips 12797 UNITS in yuv2packed2, 16383 runs, 1 skips bgra 27060 UNITS in yuv2packed2, 16384 runs, 0 skips 13138 UNITS in yuv2packed2, 16384 runs, 0 skips argb 26998 UNITS in yuv2packed2, 16384 runs, 0 skips 12728 UNITS in yuv2packed2, 16381 runs, 3 skips bgra 26651 UNITS in yuv2packed2, 16384 runs, 0 skips 13124 UNITS in yuv2packed2, 16384 runs, 0 skips This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx version is also heavily inaccurate, while the vsx version has high accuracy. Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 188 +++ 1 file changed, 188 insertions(+) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index e05f9ec..ba00791 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -793,6 +793,180 @@ yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2], } } +static av_always_inline void +yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2], + const int16_t *ubuf[2], const int16_t *vbuf[2], + const int16_t *abuf[2], uint8_t *dest, int dstW, + int yalpha, int uvalpha, int y, + enum AVPixelFormat target, int hasAlpha) +{ +const int16_t *buf0 = buf[0], *buf1 = buf[1], + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1], + *abuf0 = hasAlpha ? abuf[0] : NULL, + *abuf1 = hasAlpha ? 
abuf[1] : NULL; +const int16_t yalpha1 = 4096 - yalpha; +const int16_t uvalpha1 = 4096 - uvalpha; +vector int16_t vy, vu, vv, A = vec_splat_s16(0); +vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32; +vector int32_t R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r; +vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6; +vector uint16_t rd16, gd16, bd16; +vector uint8_t rd, bd, gd, ad, out0, out1, tmp8; +const vector int16_t vyalpha1 = vec_splats(yalpha1); +const vector int16_t vuvalpha1 = vec_splats(uvalpha1); +const vector int16_t vyalpha = vec_splats((int16_t) yalpha); +const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha); +const vector uint16_t zero16 = vec_splat_u16(0); +const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset); +const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff); +const vector int32_t y_add = vec_splats(1 << 21); +const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff); +const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff); +const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff); +const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff); +const vector int32_t rgbclip = vec_splats(1 << 30); +const vector int32_t zero32 = vec_splat_s32(0); +const vector uint32_t shift19 = vec_splats(19U); +const vector uint32_t shift22 = vec_splats(22U); +const vector uint32_t shift10 = vec_splat_u32(10); +const vector int32_t dec128 = vec_splats(128 << 19); +const vector int32_t add18 = vec_splats(1 << 18); +int i; + +// Various permutations +const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3, +0, 1, 2, 3, +4, 5, 6, 7, +4, 5, 6, 7 }; +const vector uint8_t doubleright = (vector uint8_t) {8, 9, 10, 11, +8, 9, 10, 11, +12, 13, 14, 15, +12, 13, 14, 15 }; +const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0, + 0x1, 0x11, 0, + 0x2, 0x12, 0, + 0x3, 0x13, 0, + 0x4, 0x14, 0, + 0x5 }; +const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0, + 
0x6, 0x16, 0, +
Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_2
On Mon, 1 Apr 2019 13:13:59 +0300 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags area \ > -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \ > -cpuflags 0 -v error - > > 32-bit mul, power8 only. > > ~4x speedup: > > rgb24 > 52763 UNITS in yuv2packed2, 16384 runs, 0 skips > 13453 UNITS in yuv2packed2, 16384 runs, 0 skips > bgr24 > 53144 UNITS in yuv2packed2, 16384 runs, 0 skips > 13616 UNITS in yuv2packed2, 16384 runs, 0 skips > rgba > 52796 UNITS in yuv2packed2, 16384 runs, 0 skips > 12904 UNITS in yuv2packed2, 16384 runs, 0 skips > bgra > 52732 UNITS in yuv2packed2, 16384 runs, 0 skips > 13262 UNITS in yuv2packed2, 16384 runs, 0 skips > argb > 52661 UNITS in yuv2packed2, 16384 runs, 0 skips > 12879 UNITS in yuv2packed2, 16384 runs, 0 skips > bgra > 52662 UNITS in yuv2packed2, 16384 runs, 0 skips > 12932 UNITS in yuv2packed2, 16384 runs, 0 skips > > Signed-off-by: Lauri Kasanen > --- > libswscale/ppc/swscale_vsx.c | 166 > +++ > 1 file changed, 166 insertions(+) Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_1
On Sun, 31 Mar 2019 17:18:47 +0300 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags > fast_bilinear \ > -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \ > -cpuflags 0 -v error - > > 32-bit mul, power8 only. > > 1.8-2.3x speedup: > > rgb24 > 18192 UNITS in yuv2packed1, 32767 runs, 1 skips >9983 UNITS in yuv2packed1, 32760 runs, 8 skips > bgr24 > 18665 UNITS in yuv2packed1, 32766 runs, 2 skips >9925 UNITS in yuv2packed1, 32763 runs, 5 skips > rgba > 20239 UNITS in yuv2packed1, 32767 runs, 1 skips >8794 UNITS in yuv2packed1, 32759 runs, 9 skips > bgra > 20354 UNITS in yuv2packed1, 32768 runs, 0 skips >8770 UNITS in yuv2packed1, 32761 runs, 7 skips > argb > 20185 UNITS in yuv2packed1, 32768 runs, 0 skips >8761 UNITS in yuv2packed1, 32761 runs, 7 skips > bgra > 20360 UNITS in yuv2packed1, 32766 runs, 2 skips >8759 UNITS in yuv2packed1, 32764 runs, 4 skips > > This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx > version > is also heavily inaccurate, while the vsx version has high accuracy. Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_X
On Mon, 1 Apr 2019 13:37:32 +0300 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \ > -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \ > -cpuflags 0 -v error - > > 32-bit mul, power8 only. > > ~6.4x speedup: > > rgb24 > 214278 UNITS in yuv2packedX, 16384 runs, 0 skips > 33249 UNITS in yuv2packedX, 16384 runs, 0 skips > bgr24 > 214616 UNITS in yuv2packedX, 16384 runs, 0 skips > 33233 UNITS in yuv2packedX, 16384 runs, 0 skips > rgba > 214517 UNITS in yuv2packedX, 16384 runs, 0 skips > 33271 UNITS in yuv2packedX, 16384 runs, 0 skips > bgra > 214973 UNITS in yuv2packedX, 16384 runs, 0 skips > 33397 UNITS in yuv2packedX, 16384 runs, 0 skips > argb > 214613 UNITS in yuv2packedX, 16384 runs, 0 skips > 33310 UNITS in yuv2packedX, 16384 runs, 0 skips > bgra > 214637 UNITS in yuv2packedX, 16384 runs, 0 skips > 33330 UNITS in yuv2packedX, 16384 runs, 0 skips > > Signed-off-by: Lauri Kasanen > --- > libswscale/ppc/swscale_vsx.c | 160 > +++ > 1 file changed, 160 insertions(+) Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2] Added XV Support
On Mon, 8 Apr 2019 06:39:27 +0800 Steven Liu wrote: > >+.long_name = NULL_IF_CONFIG_SMALL("Xunlie Video File"), XV is a video output format, so please make the title something like "flv: Add XV (Xunlie Video) support". - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_2
On Fri, 5 Apr 2019 11:41:19 +0300 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags > fast_bilinear \ > -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \ > -cpuflags 0 -v error - > > 32-bit mul, power8 only. > > ~2x speedup: > > rgb24 > 24431 UNITS in yuv2packed2, 16384 runs, 0 skips > 13783 UNITS in yuv2packed2, 16383 runs, 1 skips > bgr24 > 24396 UNITS in yuv2packed2, 16384 runs, 0 skips > 14059 UNITS in yuv2packed2, 16384 runs, 0 skips > rgba > 26815 UNITS in yuv2packed2, 16383 runs, 1 skips > 12797 UNITS in yuv2packed2, 16383 runs, 1 skips > bgra > 27060 UNITS in yuv2packed2, 16384 runs, 0 skips > 13138 UNITS in yuv2packed2, 16384 runs, 0 skips > argb > 26998 UNITS in yuv2packed2, 16384 runs, 0 skips > 12728 UNITS in yuv2packed2, 16381 runs, 3 skips > bgra > 26651 UNITS in yuv2packed2, 16384 runs, 0 skips > 13124 UNITS in yuv2packed2, 16384 runs, 0 skips > > This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx > version > is also heavily inaccurate, while the vsx version has high accuracy. > > Signed-off-by: Lauri Kasanen > --- > libswscale/ppc/swscale_vsx.c | 188 > +++ > 1 file changed, 188 insertions(+) Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH]lavc/alac: Make a variable unsigned
On Thu, 18 Apr 2019 13:53:37 +0200 Carl Eugen Hoyos wrote: > Hi! > > Attached patch silences a warning that is shown with some gcc versions. It pokes my style sense to have different things in the sizeof() and the var. How about uint32_t in both? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH]lavc/alac: Make a variable unsigned
On Thu, 18 Apr 2019 15:07:03 +0200 Hendrik Leppkes wrote: > On Thu, Apr 18, 2019 at 2:54 PM Lauri Kasanen wrote: > > > > On Thu, 18 Apr 2019 13:53:37 +0200 > > Carl Eugen Hoyos wrote: > > > > > Hi! > > > > > > Attached patch silences a warning that is shown with some gcc versions. > > > > It pokes my style sense to have different things in the sizeof() and > > the var. How about uint32_t in both? > > > > Those two things are entirely unrelated types, though. Indeed, my bad. Please ignore. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize hscale_fast
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags fast_bilinear \ -s 2400x720 -f rawvideo -vframes 5 -pix_fmt abgr -nostats test.raw 4.27 speedup for hyscale_fast: 24796 UNITS in hyscale_fast,4096 runs, 0 skips 5797 UNITS in hyscale_fast,4096 runs, 0 skips 4.48 speedup for hcscale_fast: 19911 UNITS in hcscale_fast,4095 runs, 1 skips 4437 UNITS in hcscale_fast,4096 runs, 0 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 196 +++ 1 file changed, 196 insertions(+) This has the same limit as the x86 version, same width or larger only. Shrinking would require a gather load, which doesn't exist on PPC and is slow even on x86 AVX. I tried a manual gather load, and the vector function was 20% slower than C. diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index ba00791..2e20ab3 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -1661,6 +1661,198 @@ YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422) YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422) YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422) +static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth, + const uint8_t *src, int srcW, int xInc) +{ +int i; +unsigned int xpos = 0, xx; +vector uint8_t vin, vin2, vperm; +vector int8_t vmul, valpha; +vector int16_t vtmp, vtmp2, vtmp3, vtmp4; +vector uint16_t vd_l, vd_r, vcoord16[2]; +vector uint32_t vcoord[4]; +const vector uint32_t vadd = (vector uint32_t) { +0, +xInc * 1, +xInc * 2, +xInc * 3, +}; +const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math +0, +xInc * 1, +xInc * 2, +xInc * 3, +xInc * 4, +xInc * 5, +xInc * 6, +xInc * 7, +}; +const vector uint32_t vshift16 = vec_splats((uint32_t) 16); +const vector uint16_t vshift9 = vec_splat_u16(9); +const vector uint8_t vzero = vec_splat_u8(0); +const vector uint16_t vshift = vec_splat_u16(7); + +for (i = 0; i < dstWidth; i += 16) { +vcoord16[0] = vec_splats((uint16_t) xpos); +vcoord16[1] 
= vec_splats((uint16_t) (xpos + xInc * 8)); + +vcoord16[0] = vec_add(vcoord16[0], vadd16); +vcoord16[1] = vec_add(vcoord16[1], vadd16); + +vcoord16[0] = vec_sr(vcoord16[0], vshift9); +vcoord16[1] = vec_sr(vcoord16[1], vshift9); +valpha = (vector int8_t) vec_pack(vcoord16[0], vcoord16[1]); + +xx = xpos >> 16; +vin = vec_vsx_ld(0, &src[xx]); + +vcoord[0] = vec_splats(xpos & 0xffff); +vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4); +vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8); +vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12); + +vcoord[0] = vec_add(vcoord[0], vadd); +vcoord[1] = vec_add(vcoord[1], vadd); +vcoord[2] = vec_add(vcoord[2], vadd); +vcoord[3] = vec_add(vcoord[3], vadd); + +vcoord[0] = vec_sr(vcoord[0], vshift16); +vcoord[1] = vec_sr(vcoord[1], vshift16); +vcoord[2] = vec_sr(vcoord[2], vshift16); +vcoord[3] = vec_sr(vcoord[3], vshift16); + +vcoord16[0] = vec_pack(vcoord[0], vcoord[1]); +vcoord16[1] = vec_pack(vcoord[2], vcoord[3]); +vperm = vec_pack(vcoord16[0], vcoord16[1]); + +vin = vec_perm(vin, vin, vperm); + +vin2 = vec_vsx_ld(1, &src[xx]); +vin2 = vec_perm(vin2, vin2, vperm); + +vmul = (vector int8_t) vec_sub(vin2, vin); +vtmp = vec_mule(vmul, valpha); +vtmp2 = vec_mulo(vmul, valpha); +vtmp3 = vec_mergeh(vtmp, vtmp2); +vtmp4 = vec_mergel(vtmp, vtmp2); + +vd_l = (vector uint16_t) vec_mergeh(vin, vzero); +vd_r = (vector uint16_t) vec_mergel(vin, vzero); +vd_l = vec_sl(vd_l, vshift); +vd_r = vec_sl(vd_r, vshift); + +vd_l = vec_add(vd_l, (vector uint16_t) vtmp3); +vd_r = vec_add(vd_r, (vector uint16_t) vtmp4); + +vec_st((vector int16_t) vd_l, 0, &dst[i]); +vec_st((vector int16_t) vd_r, 0, &dst[i + 8]); + +xpos += xInc * 16; +} +for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) +dst[i] = src[srcW-1]*128; +} + +#define HCSCALE(in, out) \ +vin = vec_vsx_ld(0, &in[xx]); \ +vin = vec_perm(vin, vin, vperm); \ +\ +vin2 = vec_vsx_ld(1, &in[xx]); \ +vin2 = vec_perm(vin2, vin2, vperm); \ +\ +vtmp = vec_mule(vin, valphaxor); \ +vtmp2 = vec_mulo(vin, valphaxor); \ +vtmp3 
= vec_mergeh(vtmp, vtmp2); \ +vtmp4 = vec_mergel(vtmp, vtmp2); \ +\ +vtmp = vec_mule(vin2, valpha); \ +vtmp2 = vec_mulo(vin2, valpha); \ +vd_l = vec_mergeh(vtmp, vtmp2); \ +vd_r = ve
Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize hscale_fast
On Wed, 24 Apr 2019 14:02:16 +0300 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags > fast_bilinear \ > -s 2400x720 -f rawvideo -vframes 5 -pix_fmt abgr -nostats test.raw > > 4.27 speedup for hyscale_fast: > 24796 UNITS in hyscale_fast,4096 runs, 0 skips >5797 UNITS in hyscale_fast,4096 runs, 0 skips > > 4.48 speedup for hcscale_fast: > 19911 UNITS in hcscale_fast,4095 runs, 1 skips >4437 UNITS in hcscale_fast,4096 runs, 0 skips > > Signed-off-by: Lauri Kasanen > --- > libswscale/ppc/swscale_vsx.c | 196 > +++ > 1 file changed, 196 insertions(+) > > This has the same limit as the x86 version, same width or larger only. > Shrinking would require a gather load, which doesn't exist on PPC and is slow > even on x86 AVX. I tried a manual gather load, and the vector function was 20% > slower than C. Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/4] swscale/ppc: VSX-optimize hScale8To19_vsx
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \ -s 2400x720 -f rawvideo -y -vframes 5 -pix_fmt yuv420p16le -nostats test.raw 2.26 speedup (x86 SSE2 is 2.32): 23772 UNITS in hscale,4096 runs, 0 skips 53862 UNITS in hscale,4096 runs, 0 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 64 +++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 2e20ab3..a82cf95 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -1853,6 +1853,64 @@ static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2, #undef HCSCALE +static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW, +const uint8_t *src, const int16_t *filter, +const int32_t *filterPos, int filterSize) +{ +int i, j; +int32_t *dst = (int32_t *) _dst; +vector int16_t vfilter, vin; +vector uint8_t vin8; +vector int32_t vout; +const vector uint8_t vzero = vec_splat_u8(0); +const vector uint8_t vunusedtab[8] = { +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}, +(vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10}, +}; +const vector uint8_t vunused = vunusedtab[filterSize % 8]; + +if (filterSize == 1) { +for (i = 0; i 
< dstW; i++) { +int srcPos = filterPos[i]; +int val= 0; +for (j = 0; j < filterSize; j++) { +val += ((int)src[srcPos + j]) * filter[filterSize * i + j]; +} +dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ... +} +} else { +for (i = 0; i < dstW; i++) { +const int srcPos = filterPos[i]; +vout = vec_splat_s32(0); +for (j = 0; j < filterSize; j += 8) { +vin8 = vec_vsx_ld(0, &src[srcPos + j]); +vin = (vector int16_t) vec_mergeh(vin8, vzero); +if (j + 8 > filterSize) // Remove the unused elements on the last round +vin = vec_perm(vin, (vector int16_t) vzero, vunused); + +vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]); +vout = vec_msums(vin, vfilter, vout); +} +vout = vec_sums(vout, (vector int32_t) vzero); +dst[i] = FFMIN(vout[3] >> 3, (1 << 19) - 1); +} +} +} + #endif /* !HAVE_BIGENDIAN */ #endif /* HAVE_VSX */ @@ -1867,12 +1925,16 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) return; #if !HAVE_BIGENDIAN -if (c->srcBpc == 8 && c->dstBpc <= 14) { +if (c->srcBpc == 8) { +if (c->dstBpc <= 14) { c->hyScale = c->hcScale = hScale_real_vsx; if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) { c->hyscale_fast = hyscale_fast_vsx; c->hcscale_fast = hcscale_fast_vsx; } +} else { +c->hyScale = c->hcScale = hScale8To19_vsx; +} } if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 && -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/4] swscale/ppc: Indent
Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index a82cf95..17c15a2 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -1927,13 +1927,13 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) #if !HAVE_BIGENDIAN if (c->srcBpc == 8) { if (c->dstBpc <= 14) { -c->hyScale = c->hcScale = hScale_real_vsx; -if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) { -c->hyscale_fast = hyscale_fast_vsx; -c->hcscale_fast = hcscale_fast_vsx; -} +c->hyScale = c->hcScale = hScale_real_vsx; +if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) { +c->hyscale_fast = hyscale_fast_vsx; +c->hcscale_fast = hcscale_fast_vsx; +} } else { -c->hyScale = c->hcScale = hScale8To19_vsx; +c->hyScale = c->hcScale = hScale8To19_vsx; } } if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/4] swscale/ppc: VSX-optimize hScale16To*
./ffmpeg -loop 1 -s 1200x1440 -i tux16.png \ -s 2400x720 -f rawvideo -y -vframes 5 -pix_fmt yuv420p16le -nostats test.raw ./ffmpeg -loop 1 -s 1200x1440 -i tux16.png \ -s 2400x720 -f rawvideo -y -vframes 5 -pix_fmt yuv420p -nostats test.raw 32-bit mul, power8 only 2x speedup for hScale8To19_vsx (x86 SSE2 is 2.37): 30896 UNITS in hscale,8192 runs, 0 skips 63956 UNITS in hscale,8192 runs, 0 skips 2.06 for hScale16To15_vsx: 30531 UNITS in hscale,8192 runs, 0 skips 63161 UNITS in hscale,8192 runs, 0 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 159 +++ 1 file changed, 159 insertions(+) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 17c15a2..31d3ba2 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -1911,6 +1911,160 @@ static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW, } } +static void hScale16To19_vsx(SwsContext *c, int16_t *_dst, int dstW, + const uint8_t *_src, const int16_t *filter, + const int32_t *filterPos, int filterSize) +{ +const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat); +int i, j; +int32_t *dst= (int32_t *) _dst; +const uint16_t *src = (const uint16_t *) _src; +int bits= desc->comp[0].depth - 1; +int sh = bits - 4; +vector int16_t vfilter, vin; +vector int32_t vout, vtmp, vtmp2, vfilter32_l, vfilter32_r; +const vector uint8_t vzero = vec_splat_u8(0); +const vector uint8_t vunusedtab[8] = { +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}, +(vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, 
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10}, +}; +const vector uint8_t vunused = vunusedtab[filterSize % 8]; + +if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) { +sh = 9; +} else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */ +sh = 16 - 1 - 4; +} + +if (filterSize == 1) { +for (i = 0; i < dstW; i++) { +int srcPos = filterPos[i]; +int val= 0; + +for (j = 0; j < filterSize; j++) { +val += src[srcPos + j] * filter[filterSize * i + j]; +} +// filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit +dst[i] = FFMIN(val >> sh, (1 << 19) - 1); +} +} else { +for (i = 0; i < dstW; i++) { +const int srcPos = filterPos[i]; +vout = vec_splat_s32(0); +for (j = 0; j < filterSize; j += 8) { +vin = (vector int16_t) vec_vsx_ld(0, &src[srcPos + j]); +if (j + 8 > filterSize) // Remove the unused elements on the last round +vin = vec_perm(vin, (vector int16_t) vzero, vunused); + +vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]); +vfilter32_l = vec_unpackh(vfilter); +vfilter32_r = vec_unpackl(vfilter); + +vtmp = (vector int32_t) vec_mergeh(vin, (vector int16_t) vzero); +vtmp2 = (vector int32_t) vec_mergel(vin, (vector int16_t) vzero); + +vtmp = vec_mul(vtmp, vfilter32_l); +vtmp2 = vec_mul(vtmp2, vfilter32_r); + +vout = vec_adds(vout, vtmp); +vout = vec_adds(vout, vtmp2); +} +vout = vec_sums(vout, (vector int32_t) vzero); +dst[i] = FFMIN(vout[3] >> sh, (1 << 19) - 1); +} +} +} + +static void hScale16To15_vsx(SwsContext *c, int16_t *dst, int dstW, + const uint8_t *_src, const int16_t *filter, + const int32_t *filterPos, int filterSize) +{ +const A
[FFmpeg-devel] [PATCH 4/4] swscale/ppc: Shorten power8 tests via a var
Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 27 ++- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 31d3ba2..a617f76 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -2074,6 +2074,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) #if HAVE_VSX enum AVPixelFormat dstFormat = c->dstFormat; const int cpu_flags = av_get_cpu_flags(); +const unsigned char power8 = HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8; if (!(cpu_flags & AV_CPU_FLAG_VSX)) return; @@ -2090,7 +2091,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) c->hyScale = c->hcScale = hScale8To19_vsx; } } else { -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_vsx : hScale16To15_vsx; } @@ -2144,21 +2145,21 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) if (c->flags & SWS_FULL_CHR_H_INT) { switch (dstFormat) { case AV_PIX_FMT_RGB24: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { c->yuv2packed1 = yuv2rgb24_full_1_vsx; c->yuv2packed2 = yuv2rgb24_full_2_vsx; c->yuv2packedX = yuv2rgb24_full_X_vsx; } break; case AV_PIX_FMT_BGR24: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { c->yuv2packed1 = yuv2bgr24_full_1_vsx; c->yuv2packed2 = yuv2bgr24_full_2_vsx; c->yuv2packedX = yuv2bgr24_full_X_vsx; } break; case AV_PIX_FMT_BGRA: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { if (!c->needAlpha) { c->yuv2packed1 = yuv2bgrx32_full_1_vsx; c->yuv2packed2 = yuv2bgrx32_full_2_vsx; @@ -2167,7 +2168,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) } break; case AV_PIX_FMT_RGBA: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { if (!c->needAlpha) { c->yuv2packed1 = yuv2rgbx32_full_1_vsx; c->yuv2packed2 = yuv2rgbx32_full_2_vsx; @@ -2176,7 +2177,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) } break; case 
AV_PIX_FMT_ARGB: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { if (!c->needAlpha) { c->yuv2packed1 = yuv2xrgb32_full_1_vsx; c->yuv2packed2 = yuv2xrgb32_full_2_vsx; @@ -2185,7 +2186,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) } break; case AV_PIX_FMT_ABGR: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { if (!c->needAlpha) { c->yuv2packed1 = yuv2xbgr32_full_1_vsx; c->yuv2packed2 = yuv2xbgr32_full_2_vsx; @@ -2212,7 +2213,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) c->yuv2packedX = yuv2uyvy422_X_vsx; break; case AV_PIX_FMT_BGRA: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { if (!c->needAlpha) { c->yuv2packed1 = yuv2bgrx32_1_vsx; c->yuv2packed2 = yuv2bgrx32_2_vsx; @@ -2220,7 +2221,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) } break; case AV_PIX_FMT_RGBA: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { if (!c->needAlpha) { c->yuv2packed1 = yuv2rgbx32_1_vsx; c->yuv2packed2 = yuv2rgbx32_2_vsx; @@ -2228,7 +2229,7 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) } break; case AV_PIX_FMT_ARGB: -if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) { +if (power8) { if (!c->needAlpha) { c->yuv2packed1 = yuv2xrgb32_1_vsx; c->yuv2packed2 = yuv2xrgb32_2_vs
Re: [FFmpeg-devel] [PATCH 3/4] swscale/ppc: VSX-optimize hScale16To*
./ffmpeg -loop 1 -s 1200x1440 -i tux16.png \ -s 2400x720 -f rawvideo -y -vframes 5 -pix_fmt yuv420p16le -nostats test.raw ./ffmpeg -loop 1 -s 1200x1440 -i tux16.png \ -s 2400x720 -f rawvideo -y -vframes 5 -pix_fmt yuv420p -nostats test.raw 32-bit mul, power8 only 2x speedup for hScale8To19_vsx (x86 SSE2 is 2.37): 30896 UNITS in hscale,8192 runs, 0 skips 63956 UNITS in hscale,8192 runs, 0 skips 2.06 for hScale16To15_vsx: 30531 UNITS in hscale,8192 runs, 0 skips 63161 UNITS in hscale,8192 runs, 0 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 159 +++ 1 file changed, 159 insertions(+) Resending due to mail client. diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 17c15a2..31d3ba2 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -1911,6 +1911,160 @@ static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW, } } +static void hScale16To19_vsx(SwsContext *c, int16_t *_dst, int dstW, + const uint8_t *_src, const int16_t *filter, + const int32_t *filterPos, int filterSize) +{ +const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat); +int i, j; +int32_t *dst= (int32_t *) _dst; +const uint16_t *src = (const uint16_t *) _src; +int bits= desc->comp[0].depth - 1; +int sh = bits - 4; +vector int16_t vfilter, vin; +vector int32_t vout, vtmp, vtmp2, vfilter32_l, vfilter32_r; +const vector uint8_t vzero = vec_splat_u8(0); +const vector uint8_t vunusedtab[8] = { +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf}, +(vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x10, 0x10, 0x10, 
0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10}, +(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10}, +}; +const vector uint8_t vunused = vunusedtab[filterSize % 8]; + +if ((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth<16) { +sh = 9; +} else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input are process like uint 16bpc */ +sh = 16 - 1 - 4; +} + +if (filterSize == 1) { +for (i = 0; i < dstW; i++) { +int srcPos = filterPos[i]; +int val= 0; + +for (j = 0; j < filterSize; j++) { +val += src[srcPos + j] * filter[filterSize * i + j]; +} +// filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit +dst[i] = FFMIN(val >> sh, (1 << 19) - 1); +} +} else { +for (i = 0; i < dstW; i++) { +const int srcPos = filterPos[i]; +vout = vec_splat_s32(0); +for (j = 0; j < filterSize; j += 8) { +vin = (vector int16_t) vec_vsx_ld(0, &src[srcPos + j]); +if (j + 8 > filterSize) // Remove the unused elements on the last round +vin = vec_perm(vin, (vector int16_t) vzero, vunused); + +vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]); +vfilter32_l = vec_unpackh(vfilter); +vfilter32_r = vec_unpackl(vfilter); + +vtmp = (vector int32_t) vec_mergeh(vin, (vector int16_t) vzero); +vtmp2 = (vector int32_t) vec_mergel(vin, (vector int16_t) vzero); + +vtmp = vec_mul(vtmp, vfilter32_l); +vtmp2 = vec_mul(vtmp2, vfilter32_r); + +vout = vec_adds(vout, vtmp); +vout = vec_adds(vout, vtmp2); +} +vout = vec_sums(vout, (vector int32_t) vzero); +dst[i] = FFMIN(vout[3] >> sh, (1 << 19) - 1); +} +} +} + +static void hScale16To15_vsx(SwsContext *c, int16_t *dst, int dstW, + const uint8_t *_src, const int16_t *filter, + const int32_t *fil
Re: [FFmpeg-devel] [PATCH 1/4] swscale/ppc: VSX-optimize hScale8To19_vsx
Copy-paste thinko in the title I see. Will remove the _vsx suffix from the title. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH V5 1/2] configure: sort decoder/encoder/filter/... names in alphabet order
On Wed, 1 May 2019 22:57:47 +0200 Carl Eugen Hoyos wrote: > 2019-04-28 3:18 GMT+02:00, Alexander Strasser : > > > What do you think about using awk instead of shell? > > Do we only use awk for --enable-random and the dependency > files so far? Does configure also work without awk now and > would this change? It seems awk is unconditionally required already. However I wanted to say that it's a very nice dep to have: easy to build, present almost everywhere, even in busybox. Nothing like perl/tcl or worse, python/java/rust/go. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/4] swscale/ppc: VSX-optimize hScale8To19_vsx
On Tue, 30 Apr 2019 14:43:52 +0300 Lauri Kasanen wrote: > ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \ > -s 2400x720 -f rawvideo -y -vframes 5 -pix_fmt yuv420p16le -nostats > test.raw > > 2.26 speedup (x86 SSE2 is 2.32): > 23772 UNITS in hscale,4096 runs, 0 skips > 53862 UNITS in hscale,4096 runs, 0 skips > > Signed-off-by: Lauri Kasanen > --- > libswscale/ppc/swscale_vsx.c | 64 > +++- > 1 file changed, 63 insertions(+), 1 deletion(-) Applying the series. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale: Add support for NV24 and NV42
On Thu, 9 May 2019 22:59:12 -0700 Philip Langdale wrote: > I don't think this is terribly useful, as the only thing out there that > can even handle NV24 content is VDPAU and the only time you have to > deal with it is when doing VDPAU OpenGL interop where swscale is > irrelevant. In the other cases you can use YV24 (YUV444P). > > But anyway, I was asked to do this for the sake of completeness. > > The implementation is pretty straight-forward. Most of the existing > NV12 codepaths work regardless of subsampling and are re-used as is. > Where necessary I wrote the slightly different NV24 versions. > > Finally, the one thing that confused me for a long time was the > asm specific x86 path that did an explicit exclusion check for NV12. > I replaced that with a semi-planar check and also updated the > equivalent PPC code, but which I cannot test. I'm having trouble making out what formats exactly isSemiPlanarYUV() matches. Are you sure it's an equivalent check? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale: Add support for NV24 and NV42
On Fri, 10 May 2019 08:07:45 -0700 Philip Langdale wrote: > On Fri, 10 May 2019 09:35:40 +0300 > Lauri Kasanen wrote: > > > > > I'm having trouble making out what formats exactly isSemiPlanarYUV() > > matches. Are you sure it's an equivalent check? > > > > Well, the check's been in there for quite a while; that's not new. > > (isPlanarYUV(pix_fmt) && desc->comp[1].plane == desc->comp[2].plane); > > So, any planar yuv format where component 1 and component 2 are on the > same plane. Except for semi planar formats, you expect either all > components on the same plane (packed, so not planar) or every component > on a separate plain (normal planar). Yes, I understand that. I mean: can you list all formats that function matches? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale: Add support for NV24 and NV42
On Fri, 10 May 2019 10:08:57 -0700 Philip Langdale wrote: > On 2019-05-10 08:12, Lauri Kasanen wrote: > > On Fri, 10 May 2019 08:07:45 -0700 > > Philip Langdale wrote: > > > >> On Fri, 10 May 2019 09:35:40 +0300 > >> Lauri Kasanen wrote: > >> > >> > > >> > I'm having trouble making out what formats exactly isSemiPlanarYUV() > >> > matches. Are you sure it's an equivalent check? > >> > > >> > >> Well, the check's been in there for quite a while; that's not new. > >> > >> (isPlanarYUV(pix_fmt) && desc->comp[1].plane == desc->comp[2].plane); > >> > >> So, any planar yuv format where component 1 and component 2 are on the > >> same plane. Except for semi planar formats, you expect either all > >> components on the same plane (packed, so not planar) or every > >> component > >> on a separate plain (normal planar). > > > > Yes, I understand that. I mean: can you list all formats that function > > matches? > > For formats that swscale understands: > > NV12, NV21 > P010(BE|LE) > P016(BE|LE) > > and now NV24, NV42. > > There are also NV16 and NV20(BE|LE) formats which are not supported by > swscale. Thanks. Then the ppc part looks ok to me. Please include that list in the commit message too. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] yuv420_bgr24_mmxext conversion taking significant time
On Fri, 7 Jun 2019 08:38:35 -0700 Adrian Tong wrote: > Hi > > I have a workload which spends a significant amount of time (~10%) in > the yuv420_bgr24_mmxext function in FFMEPG. > > I looked at the assembly and profile and see MMX (64 bit) registers are > used. I wonder whether we can have a SSE2 version which has a register bit > width of 128. > > I am very interested in implementing such support if it is possible. I'm not well versed in x86 vectors, so I can't say if SSE2 is enough or some other SSE version would be needed, but certainly YUV to RGB conversion can be done faster than with MMX. Please do send a patch. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] yuv420_bgr24_mmxext conversion taking significant time
On Sat, 8 Jun 2019 06:51:51 -0700 Adrian Tong wrote: > Hi Lauri. > > Thanks for the reply, any reason why this has not been implemented before ? > it seems to me that this would be a pretty important/hot function. Just the usual, nobody has had the interest. There are other places too where the only x86 accel is mmx. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] yuv420_bgr24_mmxext conversion taking significant time
On Mon, 10 Jun 2019 17:42:00 -0700 Adrian Tong wrote: > I have been trying to implement yuv420_to_bgr24 using SSE2 instruction. I > ran into the case where the output of C implemented yuv420_to_bgr24 has > slightly different resulting bgr24 image from MMX implemented > yuv420_to_bgr24. Is this expected behavior ? Yes, some of the MMX implementations choose speed over accuracy; I ran into that myself when doing the PPC versions. For an SSE version, if an accurate version is fast enough, please try to match the C version. Otherwise, try to match MMX. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 0/2] AltiVec/VSX fixes in swscale
Hi, I approve of this series, but being in the middle of a move, I can't test it. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH V3] swscale/ppc/yuv2rgb_altivec: Fixes compiler bug - replace vec_lvsl/vec_perm with vec_xl
Hi, This change uses VSX code in a function marked as AltiVec, i.e. the function no longer works on pre-POWER7 hardware (Macs, etc.). As such, I would have NAK'd it. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] configure: use -r, not -E, for sed
Old versions of sed do not support the -E option. Signed-off-by: Lauri Kasanen --- configure | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index b02b4cc..51f1227 100755 --- a/configure +++ b/configure @@ -3722,7 +3722,7 @@ find_things_extern(){ find_filters_extern(){ file=$source_path/$1 #sed -n "s/^extern AVFilter ff_\([avfsinkrc]\{2,5\}\)_\(\w\+\);/\2_filter/p" $file -sed -E -n "s/^extern AVFilter ff_([avfsinkrc]{2,5})_([a-zA-Z0-9_]+);/\2_filter/p" $file +sed -r -n "s/^extern AVFilter ff_([avfsinkrc]{2,5})_([a-zA-Z0-9_]+);/\2_filter/p" $file } FILTER_LIST=$(find_filters_extern libavfilter/allfilters.c) @@ -5188,7 +5188,7 @@ case $target_os in is_in -isysroot $ld $LDFLAGS || check_ldflags -isysroot $sysroot fi version_script='-exported_symbols_list' -VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n /global:/,/local:/p | grep ";" | tr ";" "\n" | sed -E "s/(.+)/_\1/g" | sed -E "s/(.+[^*])/\1*/"' +VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n /global:/,/local:/p | grep ";" | tr ";" "\n" | sed -r "s/(.+)/_\1/g" | sed -r "s/(.+[^*])/\1*/"' ;; msys*) die "Native MSYS builds are discouraged, please use the MINGW environment." -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize yuv2plane1_8
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p \ -f null -vframes 100 -v error -nostats - 1158 UNITS in planar1, 65528 runs, 8 skips -cpuflags 0 19082 UNITS in planar1, 65533 runs, 3 skips 16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version takes as many cycles as the x86 SSE2 version, yikes it's fast. Note that this function uses VSX instructions, but is not marked so. This is because several existing functions also make that mistake. I'll submit a patch moving them all once this is reviewed. No BE support since I can only test LE. LE is however the common case for POWER8 and POWER9. Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_altivec.c | 55 1 file changed, 55 insertions(+) diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index 2fb2337..a064016 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -324,6 +324,53 @@ static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW, } } } + +static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset, int start) +{ +int i; +for (i = start; i < dstW; i++) { +int val = (src[i] + dither[(i + offset) & 7]) >> 7; +dest[i] = av_clip_uint8(val); +} +} + +static void yuv2plane1_8_altivec(const int16_t *src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ +const int dst_u = -(uintptr_t)dest & 15; +int i, j; +LOCAL_ALIGNED(16, int16_t, val, [16]); +const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7}; +vector int16_t vi, vileft, ditherleft, ditherright; +vector uint8_t vd; + +for (j = 0; j < 16; j++) { +val[j] = dither[(dst_u + offset + j) & 7]; +} + +ditherleft = vec_ld(0, val); +ditherright = vec_ld(0, &val[8]); + +yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0); + +for (i = dst_u; i < dstW - 15; i += 16) { + +vi = vec_vsx_ld(0, &src[i]); +vi = vec_adds(ditherleft, vi); +vileft = vec_sra(vi, shifts); + 
+vi = vec_vsx_ld(0, &src[i + 8]); +vi = vec_adds(ditherright, vi); +vi = vec_sra(vi, shifts); + +vd = vec_packsu(vileft, vi); +vec_st(vd, 0, &dest[i]); +} + +yuv2plane1_8_u(src, dest, dstW, dither, offset, i); +} + #endif /* HAVE_ALTIVEC */ av_cold void ff_sws_init_swscale_ppc(SwsContext *c) @@ -367,6 +414,14 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c) c->yuv2packedX = ff_yuv2rgb24_X_altivec; break; } + +switch (c->dstBpc) { +case 8: +#if !HAVE_BIGENDIAN +c->yuv2plane1 = yuv2plane1_8_altivec; +break; +#endif /* !HAVE_BIGENDIAN */ +} } #endif /* HAVE_ALTIVEC */ } -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] configure: use -r, not -E, for sed
On Fri, 16 Nov 2018 22:36:16 +0100 Carl Eugen Hoyos wrote: > 2018-11-15 15:00 GMT+01:00, Lauri Kasanen : > > Old versions of sed do not support the -E option. > > > -VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n > > /global:/,/local:/p | grep ";" | tr ";" "\n" | sed -E "s/(.+)/_\1/g" | sed > > -E "s/(.+[^*])/\1*/"' > > +VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n > > /global:/,/local:/p | grep ";" | tr ";" "\n" | sed -r "s/(.+)/_\1/g" | sed > > -r "s/(.+[^*])/\1*/"' > > Could you try to replace the current command with one that > neither needs "-E" nor "-r"? > Your suggestions fixes antique Linux systems but not current > non-Linux Posix systems (and contradicts the documentation). Regexes tend to be write-only. I'm not sure I can parse what that command is trying to do well enough to rewrite it using the basic REs that POSIX sed supports. What do you mean by "contradicts the documentation"? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize yuv2plane1_8
On Fri, 16 Nov 2018 22:09:25 +0100 Carl Eugen Hoyos wrote: > (This is less important atm, but I believe all functions currently > in libswscale/ppc compile and run fine on - old - 32bit be hardware > as your new function does. > My completely inexperienced suspicion is that the instruction that > you call "VSX" also exists on Altivec.) Ref http://gcc.gnu.org/onlinedocs/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html#PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06 VSX functions such as vec_vsx_ld were added in ISA 2.06, aka POWER7. They shouldn't compile on earlier PPC like Apple G4/G5. Is your machine at least POWER7? > I wanted to write that this hunk breaks compilation on big-endian > (you should be able to test with "#if 0" instead of "#if !HAVE_BIGENDIAN") > but the good news is that your patch works fine on big-endian, > just remove the if-endif block. (Tested visually with lena on 32 and 64bit > be.) Thanks, will do. > Are you aware of the bounty that is offered for this task? > https://trac.ffmpeg.org/ticket/5568 > (and #5569, #5570) Yes, I admit that's why I started. Looking to make some extra, and helping IBM is not a bad way to do so. I'm considering getting a Raptor Blackbird when it comes out next year. > There is a bug report about one altivec routine that works on > big-endian but breaks the output visually on little-endian while > many other functions work on both, could you have a look? > https://trac.ffmpeg.org/ticket/7124 I'll try. This patch was my first time playing with Power vectors. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p \ -f null -vframes 100 -v error -nostats - 1158 UNITS in planar1, 65528 runs, 8 skips -cpuflags 0 19082 UNITS in planar1, 65533 runs, 3 skips 16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version takes as many cycles as the x86 SSE2 version, yikes it's fast. Note that this function uses VSX instructions, but is not marked so. This is because several existing functions also make that mistake. I'll submit a patch moving them once this is reviewed. v2: Remove !BE check Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_altivec.c | 53 1 file changed, 53 insertions(+) diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index 2fb2337..8c6056d 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -324,6 +324,53 @@ static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW, } } } + +static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset, int start) +{ +int i; +for (i = start; i < dstW; i++) { +int val = (src[i] + dither[(i + offset) & 7]) >> 7; +dest[i] = av_clip_uint8(val); +} +} + +static void yuv2plane1_8_altivec(const int16_t *src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset) +{ +const int dst_u = -(uintptr_t)dest & 15; +int i, j; +LOCAL_ALIGNED(16, int16_t, val, [16]); +const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7}; +vector int16_t vi, vileft, ditherleft, ditherright; +vector uint8_t vd; + +for (j = 0; j < 16; j++) { +val[j] = dither[(dst_u + offset + j) & 7]; +} + +ditherleft = vec_ld(0, val); +ditherright = vec_ld(0, &val[8]); + +yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0); + +for (i = dst_u; i < dstW - 15; i += 16) { + +vi = vec_vsx_ld(0, &src[i]); +vi = vec_adds(ditherleft, vi); +vileft = vec_sra(vi, shifts); + +vi = vec_vsx_ld(0, &src[i + 8]); +vi = vec_adds(ditherright, vi); +vi = 
vec_sra(vi, shifts); + +vd = vec_packsu(vileft, vi); +vec_st(vd, 0, &dest[i]); +} + +yuv2plane1_8_u(src, dest, dstW, dither, offset, i); +} + #endif /* HAVE_ALTIVEC */ av_cold void ff_sws_init_swscale_ppc(SwsContext *c) @@ -367,6 +414,12 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c) c->yuv2packedX = ff_yuv2rgb24_X_altivec; break; } + +switch (c->dstBpc) { +case 8: +c->yuv2plane1 = yuv2plane1_8_altivec; +break; +} } #endif /* HAVE_ALTIVEC */ } -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize yuv2plane1_8
On Sat, 17 Nov 2018 15:20:08 +0100 Carl Eugen Hoyos wrote: > 2018-11-17 9:09 GMT+01:00, Lauri Kasanen : > > Carl Eugen Hoyos wrote: > >> (This is less important atm, but I believe all functions currently > >> in libswscale/ppc compile and run fine on - old - 32bit be hardware > >> as your new function does. > >> My completely inexperienced suspicion is that the instruction that > >> you call "VSX" also exists on Altivec.) > > > > Ref > > http://gcc.gnu.org/onlinedocs/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html#PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06 > > > > VSX functions such as vec_vsx_ld were added in ISA 2.06, aka POWER7. > > The instruction vec_vsx_ld is currently only used for little-endian ppc > which I thought did not exist before power7, am I wrong? Looks like there were LE Powers like the 440 already in 1999: https://lwn.net/Articles/408051/ datasheets.chipdb.org/IBM/PowerPC/440/PowerPC-440-Core.pdf - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p > \ > -f null -vframes 100 -v error -nostats - > > 1158 UNITS in planar1, 65528 runs, 8 skips > > -cpuflags 0 > > 19082 UNITS in planar1, 65533 runs, 3 skips > > 16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version > takes as many cycles as the x86 SSE2 version, yikes it's fast. > > Note that this function uses VSX instructions, but is not marked so. > This is because several existing functions also make that mistake. > I'll submit a patch moving them once this is reviewed. > > v2: Remove !BE check > Signed-off-by: Lauri Kasanen Ping. Seems not many ffmpeg devs interested in ppc. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8
On Wed, 21 Nov 2018 13:21:58 +0100 Michael Niedermayer wrote: > On Wed, Nov 21, 2018 at 10:12:48AM +0200, Lauri Kasanen wrote: > > > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > > > yuv420p \ > > > -f null -vframes 100 -v error -nostats - > > > > > > 1158 UNITS in planar1, 65528 runs, 8 skips > > > > > > -cpuflags 0 > > > > > > 19082 UNITS in planar1, 65533 runs, 3 skips > > > > > > 16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version > > > takes as many cycles as the x86 SSE2 version, yikes it's fast. > > > > > > Note that this function uses VSX instructions, but is not marked so. > > > This is because several existing functions also make that mistake. > > > I'll submit a patch moving them once this is reviewed. > > > > > > v2: Remove !BE check > > > Signed-off-by: Lauri Kasanen > > > > Ping. Seems not many ffmpeg devs interested in ppc. > > have you tried "make fate" with this patch (note you need to configure with > fate samples" so all tests are run I ran those fate tests containing "scale" in the name, I gather the full suite takes > 20min. Otherwise I tested with a PNG to video conversion on LE, and Carl Eugen Hoyos tested with Lena on BE. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8
On Wed, 21 Nov 2018 17:22:36 +0100 Michael Niedermayer wrote: > the full fate tests must be run, many of these tests use swscale without > having "scale" in their name > and yes on lower end hardware 20min and longer is possible I get failures on the baseline, without my patch. What is the procedure here? Is there a variable to skip those tests, or some other way? First I ran with THREADS=3; the baseline blew up in fate-h264-conformance-frext-hpcafl_bcrm_c. Then I ran without THREADS; it got further, but blew up in fate-rv20-1239. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8
On Fri, 23 Nov 2018 03:26:50 +0100 Michael Niedermayer wrote: > On Wed, Nov 21, 2018 at 07:19:45PM +0200, Lauri Kasanen wrote: > > On Wed, 21 Nov 2018 17:22:36 +0100 > > Michael Niedermayer wrote: > > > the full fate tests must be run, many of these tests use swscale without > > > having "scale" in their name > > > and yes on lower end hardware 20min and longer is possible > > > > I get failures on the baseline, without my patch. What is the procedure > > here? Is there a var to skip those tests, or? > > procedure ? > First i try to convince you to attempt to fix some of these failures ;) > because well, everyone would benefit if they are fixed ... I mean, if my patch adds no failures, is that enough to apply it? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8
On Fri, 23 Nov 2018 23:01:02 +0100 Michael Niedermayer wrote: > On Fri, Nov 23, 2018 at 10:38:13AM +0200, Lauri Kasanen wrote: > > I mean, if my patch adds no failures, is that enough to apply it? > > yes that and the tests failing should still fail the same way with the > same checksums > This of course assumes noone finds an issue in the patch Okay, ran both with -k. No new failures, and fate-rv20-1239 failed with the same checksums in both cases. That was the only failing test, did not try with THREADS. Curiously "make CPUFLAGS=0 fate-rv20-1239" also fails, so it's not Altivec code that breaks that test, but C (?). - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] fate-rv20-1239 failure on power8, aliasing bug
Hi, The lone failing fate test on power8 looks like an aliasing issue. I've isolated it into the attached standalone test case. Compiling it with gcc -std=c11 -maltivec -mabi=altivec -mvsx -O3 -fno-tree-vectorize -o test test.c reproduces it on gcc 8.2.0; dropping the optimization level fixes it. This was one of the "adding a printf made it work" things too. -Wstrict-aliasing=1 complains about the "register int *idataptr = (int*)dataptr;" cast. If I put "typedef int __attribute__((may_alias)) int_alias;" at the top and change the cast and type to int_alias, the results become correct. This code would probably crash on systems where unaligned access is prohibited; I think the incoming block is just 16-bit aligned. How do you prefer to fix alignment/aliasing issues? - Lauri test.c Description: Binary data ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] fate-rv20-1239 failure on power8, aliasing bug
On Sun, 25 Nov 2018 17:17:58 +0200 Lauri Kasanen wrote: > This code would probably crash on systems where unaligned access is > prohibited, I think the incoming block is just 16-bit aligned. I see the block comes from aligned malloc, so scratch that part, it's at least 128-bit aligned. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8
On Mon, 26 Nov 2018 11:03:55 +0300 Michael Kostylev wrote: > > http://fate.xffm.org/?sort=arch > /ppc Yeah, mentioned in the commit message. Follow-up patch coming today. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] fate-rv20-1239 failure on power8, aliasing bug
On Mon, 26 Nov 2018 00:45:26 +0100 Carl Eugen Hoyos wrote: > 2018-11-25 16:17 GMT+01:00, Lauri Kasanen : > > Hi, > > > > The lone power8 fate failing test seems like an aliasing issue. > > I've isolated it into the attached standalone test case. Compiling it > > with > > gcc -std=c11 -maltivec -mabi=altivec -mvsx -O3 -fno-tree-vectorize > > -o test test.c > > > > reproduces on gcc 8.2.0, dropping the optimization level fixes it. This > > was one of the "adding a printf made it work" things too. > > > > -Wstrict-aliasing=1 complains about the "register int *idataptr = > > (int*)dataptr;" cast. If I put "typedef int __attribute__((may_alias)) > > int_alias;" at the top and change the cast and type to int_alias, the > > results become correct. > > Thank you for the analysis! > > Patch attached, Carl Eugen Tested, fixes the fate test for me. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). Can anyone test BE? Signed-off-by: Lauri Kasanen --- libswscale/ppc/Makefile | 1 + libswscale/ppc/swscale_altivec.c | 291 ++ libswscale/ppc/swscale_ppc_template.c | 217 + libswscale/ppc/swscale_vsx.c | 164 +++ libswscale/swscale_internal.h | 1 + 5 files changed, 393 insertions(+), 281 deletions(-) create mode 100644 libswscale/ppc/swscale_ppc_template.c create mode 100644 libswscale/ppc/swscale_vsx.c diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile index d1b596e..0a31a30 100644 --- a/libswscale/ppc/Makefile +++ b/libswscale/ppc/Makefile @@ -1,3 +1,4 @@ OBJS += ppc/swscale_altivec.o \ ppc/yuv2rgb_altivec.o \ ppc/yuv2yuv_altivec.o \ +ppc/swscale_vsx.o diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index 8c6056d..1d2b2fa 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -31,21 +31,14 @@ #include "yuv2rgb_altivec.h" #include "libavutil/ppc/util_altivec.h" -#if HAVE_ALTIVEC +#if HAVE_ALTIVEC && HAVE_BIGENDIAN #define vzero vec_splat_s32(0) -#if HAVE_BIGENDIAN #define GET_LS(a,b,c,s) {\ vector signed short l2 = vec_ld(((b) << 1) + 16, s);\ ls = vec_perm(a, l2, c);\ a = l2;\ } -#else -#define GET_LS(a,b,c,s) {\ -ls = a;\ -a = vec_vsx_ld(((b) << 1) + 16, s);\ -} -#endif #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\ vector signed short ls;\ @@ -59,7 +52,6 @@ d2 = vec_add(d2, vf2);\ } while (0) -#if HAVE_BIGENDIAN #define LOAD_FILTER(vf,f) {\ vector unsigned char perm0 = vec_lvsl(joffset, f);\ vf = vec_ld(joffset, f);\ @@ -69,89 +61,7 @@ p = vec_lvsl(xoffset, s);\ ll1 = vec_ld(xoffset, s);\ } -#else -#define LOAD_FILTER(vf,f) {\ -vf = vec_vsx_ld(joffset, f);\ -} -#define LOAD_L1(ll1,s,p){\ -ll1 = vec_vsx_ld(xoffset, s);\ -} -#endif - -static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, - const uint8_t *dither, int offset, int x) -{ -register 
int i, j; -LOCAL_ALIGNED(16, int, val, [16]); -vector signed int vo1, vo2, vo3, vo4; -vector unsigned short vs1, vs2; -vector unsigned char vf; -vector unsigned int altivec_vectorShiftInt19 = -vec_add(vec_splat_u32(10), vec_splat_u32(9)); - -for (i = 0; i < 16; i++) -val[i] = dither[(x + i + offset) & 7] << 12; - -vo1 = vec_ld(0, val); -vo2 = vec_ld(16, val); -vo3 = vec_ld(32, val); -vo4 = vec_ld(48, val); - -for (j = 0; j < filterSize; j++) { -unsigned int joffset=j<<1; -unsigned int xoffset=x<<1; -vector unsigned char perm; -vector signed short l1,vLumFilter; -LOAD_FILTER(vLumFilter,filter); -vLumFilter = vec_splat(vLumFilter, 0); -LOAD_L1(l1,src[j],perm); -yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter); -yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter); -} - -vo1 = vec_sra(vo1, altivec_vectorShiftInt19); -vo2 = vec_sra(vo2, altivec_vectorShiftInt19); -vo3 = vec_sra(vo3, altivec_vectorShiftInt19); -vo4 = vec_sra(vo4, altivec_vectorShiftInt19); -vs1 = vec_packsu(vo1, vo2); -vs2 = vec_packsu(vo3, vo4); -vf = vec_packsu(vs1, vs2); -VEC_ST(vf, 0, dest); -} - - -static inline void yuv2planeX_u(const int16_t *filter, int filterSize, -const int16_t **src, uint8_t *dest, int dstW, -const uint8_t *dither, int offset, int x) -{ -int i, j; - -for (i = x; i < dstW; i++) { -int t = dither[(i + offset) & 7] << 12; -for (j = 0; j < filterSize; j++) -t += src[j][i] * filter[j]; -dest[i] = av_clip_uint8(t >> 19); -} -} - -static void yuv2planeX_altivec(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -int dst_u = -(uintptr_t)dest & 15; -int i; - -yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); - -for (i = dst_u; i < dstW - 15; i += 16) -yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither, - offset, i); - -yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); -} -#if HAVE_BIGENDIAN // The 3 above is 2 (filterSize =
[FFmpeg-devel] [PATCH] swscale/output: VSX-optimize nbps yuv2plane1
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p9le \ -f null -vframes 100 -v error -nostats - Speedups: yuv2plane1_9BE_vsx 11.2042 yuv2plane1_9LE_vsx 11.156 yuv2plane1_10BE_vsx 9.89428 yuv2plane1_10LE_vsx 10.3637 yuv2plane1_12BE_vsx 9.71923 yuv2plane1_12LE_vsx 11.0404 yuv2plane1_14BE_vsx 10.1763 yuv2plane1_14LE_vsx 11.2728 Fate passes, each format tested with an image to video conversion. Depends on "swscale/ppc: Move VSX-using code to its own file". Only tested on LE. Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 83 1 file changed, 83 insertions(+) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 853b587..6462c11 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -131,6 +131,75 @@ static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW, yuv2plane1_8_u(src, dest, dstW, dither, offset, i); } +#if !HAVE_BIGENDIAN + +#define output_pixel(pos, val) \ +if (big_endian) { \ +AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \ +} else { \ +AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \ +} + +static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +int shift = 15 - output_bits; + +for (i = start; i < dstW; i++) { +int val = src[i] + (1 << (shift - 1)); +output_pixel(&dest[i], val); +} +} + +static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, + int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 15 - output_bits; +const int add = (1 << (shift - 1)); +const int clip = (1 << output_bits) - 1; +const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, add, add, add}; +const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 
8 : 0); +const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift); +const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip}; +vector uint16_t v; +int i; + +yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0); + +for (i = dst_u; i < dstW - 7; i += 8) { +v = vec_vsx_ld(0, (const uint16_t *) &src[i]); +v = vec_add(v, vadd); +v = vec_sr(v, vshift); +v = vec_min(v, vlargest); +v = vec_rl(v, vswap); +vec_st(v, 0, &dest[i]); +} + +yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); +} + +#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \ +static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \ + uint8_t *dest, int dstW, \ + const uint8_t *dither, int offset) \ +{ \ +yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \ + (uint16_t *) dest, dstW, is_be, bits); \ +} + +yuv2NBPS( 9, BE, 1, nbps, int16_t) +yuv2NBPS( 9, LE, 0, nbps, int16_t) +yuv2NBPS(10, BE, 1, nbps, int16_t) +yuv2NBPS(10, LE, 0, nbps, int16_t) +yuv2NBPS(12, BE, 1, nbps, int16_t) +yuv2NBPS(12, LE, 0, nbps, int16_t) +yuv2NBPS(14, BE, 1, nbps, int16_t) +yuv2NBPS(14, LE, 0, nbps, int16_t) + +#endif /* !HAVE_BIGENDIAN */ + #endif /* HAVE_VSX */ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) @@ -158,6 +227,20 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) case 8: c->yuv2plane1 = yuv2plane1_8_vsx; break; +#if !HAVE_BIGENDIAN +case 9: +c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx : yuv2plane1_9LE_vsx; +break; +case 10: +c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx : yuv2plane1_10LE_vsx; +break; +case 12: +c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx : yuv2plane1_12LE_vsx; +break; +case 14: +c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx; +break; +#endif } } #endif /* HAVE_VSX */ -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file
On Mon, 26 Nov 2018 14:24:15 +0200 Lauri Kasanen wrote: > Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). > Can anyone test BE? Ping. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file
On Fri, 30 Nov 2018 12:30:58 +0300 Michael Kostylev wrote: > > >> Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" > >> applied). Can anyone test BE? > > > > Ping. > > FATE becomes green as much as possible, I haven't performed any benchmarking > though. Thanks for testing. This patch is not expected to change performance, it's just moving functions around and putting them under proper VSX guards. - Lauri PS: Your mail did not make it to the list, was it meant for me only? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file
On Fri, 30 Nov 2018 14:05:26 +0200 Lauri Kasanen wrote: > On Fri, 30 Nov 2018 12:30:58 +0300 > Michael Kostylev wrote: > > > > >> Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" > > >> applied). Can anyone test BE? > > > > > > Ping. > > > > FATE becomes green as much as possible, I haven't performed any > > benchmarking though. > > Thanks for testing. This patch is not expected to change performance, > it's just moving functions around and putting them under proper VSX > guards. Could this patch be applied? Also ping on "swscale/output: VSX-optimize nbps yuv2plane1". - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file
On Tue, 4 Dec 2018 03:21:30 +0100 Michael Niedermayer wrote: > On Mon, Dec 03, 2018 at 09:24:47AM +0200, Lauri Kasanen wrote: > > Also ping on "swscale/output: VSX-optimize > > nbps yuv2plane1". > > This IIUC has not been tested on BE yet > > my ppc emulation setup is a bit broken and my ppc hw ive not tried using > since years and it was not in good shape last i used it. > So i cant just quickly test this ... Raptor offers free POWER9 VMs to open source projects. Since you're the leader of ffmpeg, if you asked, I'm sure they'd give one or two for ffmpeg build and fate testing. Ref https://mobile.twitter.com/RaptorCompSys/status/1067018060777832449?p=v https://mobile.twitter.com/RaptorCompSys/status/1067029086273486848?p=v "We offer free access to cloud VPS for libre software projects in partnership with @Integricloud, would that help?" "Contact sa...@integricloud.com and tell them what you want to use a VPS or two for. They will generally grant access to the resources." (I'm developing on a POWER8 VM intended for devs, but ordered a Blackbird from the cyber monday sale ;)) - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/output: VSX-optimize nbps yuv2plane1
On Thu, 6 Dec 2018 22:36:01 +0100 Carl Eugen Hoyos wrote: > 2018-11-27 14:26 GMT+01:00, Lauri Kasanen : > > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > > yuv420p9le \ > > -f null -vframes 100 -v error -nostats - > > > > Speedups: > > yuv2plane1_9BE_vsx 11.2042 > > yuv2plane1_9LE_vsx 11.156 > > yuv2plane1_10BE_vsx 9.89428 > > yuv2plane1_10LE_vsx 10.3637 > > yuv2plane1_12BE_vsx 9.71923 > > yuv2plane1_12LE_vsx 11.0404 > > yuv2plane1_14BE_vsx 10.1763 > > yuv2plane1_14LE_vsx 11.2728 > > > > Fate passes, each format tested with an image to video conversion. > > > > Depends on "swscale/ppc: Move VSX-using code to its own file". > > > Only tested on LE. > > This patch breaks output on BE, tested with fate-v410enc and: > $ ffmpeg -i fate-suite/lena.pnm -pix_fmt yuv420p10 -vcodec ffv1 out.nut Just checking, was that with the !BE guards removed? Otherwise I don't see how it could affect BE? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/output: VSX-optimize nbps yuv2plane1
On Fri, 7 Dec 2018 13:50:12 +0100 Carl Eugen Hoyos wrote: > > Carl Eugen Hoyos wrote: > >> 2018-11-27 14:26 GMT+01:00, Lauri Kasanen : > >> > Fate passes, each format tested with an image to video conversion. > >> > > >> > Depends on "swscale/ppc: Move VSX-using code to its own file". > >> > >> > Only tested on LE. > >> > >> This patch breaks output on BE, tested with fate-v410enc and: > >> $ ffmpeg -i fate-suite/lena.pnm -pix_fmt yuv420p10 -vcodec ffv1 out.nut > > > > Just checking, was that with the !BE guards removed? > > Correct, sorry for being unclear. > > > Otherwise I don't see how it could affect BE? > > Yes. Okay, so it otherwise didn't affect BE. Can it be applied, or is BE a requirement? This is a simple function, and I can guess how to change it, but for future, more complex functions I'd rather not try blindly. LE is the common case for newer POWER really; many distros don't even support BE. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file
On Thu, 6 Dec 2018 21:47:18 +0100 Michael Niedermayer wrote: > On Tue, Dec 04, 2018 at 02:27:22PM +0100, Michael Niedermayer wrote: > > > > > On Mon, Dec 03, 2018 at 09:24:47AM +0200, Lauri Kasanen wrote: > > > > > > Also ping on "swscale/output: VSX-optimize > > > > > > nbps yuv2plane1". > > > > > > > > > > This IIUC has not been tested on BE yet > > > > > > > > > > my ppc emulation setup is a bit broken and my ppc hw ive not tried > > > > > using > > > > > since years and it was not in good shape last i used it. > > > > > So i cant just quickly test this ... > > these are more suggestions than i expected :) > > but i just got cross build working again and i also just eliminated a > > mysterious ld.so related segfault > > ATM iam re rerunning fate with a freshly rebuilt qemu > > (the past one had an issue with altivec) > > i have cross build with ppc and qemu partly working > but it appears gcc or something is just buggy Hi, Carl Eugen Hoyos reported that it builds fine on BE, the guards being in the correct place so that BE is not affected. How are things on your side? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] swscale/output: VSX-optimize 16-bit yuv2plane1
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16le \ -f null -vframes 100 -v error -nostats - 19157 UNITS in planar1, 65512 runs, 24 skips -cpuflags 0 2120 UNITS in planar1, 65393 runs,143 skips 9.03632 speedup, 16be similarly. Fate passes, each format tested with an image to video conversion. Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_vsx.c | 59 1 file changed, 59 insertions(+) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 6462c11..70da6ae 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -180,6 +180,60 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +#undef output_pixel + +#define output_pixel(pos, val, bias, signedness) \ +if (big_endian) { \ +AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} else { \ +AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} + +static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +const int shift = 3; + +for (i = start; i < dstW; i++) { +int val = src[i] + (1 << (shift - 1)); +output_pixel(&dest[i], val, 0, uint); +} +} + +static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW, + int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 3; +const int add = (1 << (shift - 1)); +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 
8 : 0); +const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift); +vector uint32_t v, v2; +vector uint16_t vd; +int i; + +yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0); + +for (i = dst_u; i < dstW - 7; i += 8) { +v = vec_vsx_ld(0, (const uint32_t *) &src[i]); +v = vec_add(v, vadd); +v = vec_sr(v, vshift); + +v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]); +v2 = vec_add(v2, vadd); +v2 = vec_sr(v2, vshift); + +vd = vec_packsu(v, v2); +vd = vec_rl(vd, vswap); + +vec_st(vd, 0, &dest[i]); +} + +yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i); +} + #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \ static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \ uint8_t *dest, int dstW, \ @@ -197,6 +251,8 @@ yuv2NBPS(12, BE, 1, nbps, int16_t) yuv2NBPS(12, LE, 0, nbps, int16_t) yuv2NBPS(14, BE, 1, nbps, int16_t) yuv2NBPS(14, LE, 0, nbps, int16_t) +yuv2NBPS(16, BE, 1, 16, int32_t) +yuv2NBPS(16, LE, 0, 16, int32_t) #endif /* !HAVE_BIGENDIAN */ @@ -240,6 +296,9 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) case 14: c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx; break; +case 16: +c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx; +break; #endif } } -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v2] swscale/output: VSX-optimize 16-bit yuv2plane1
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16le \ -f null -vframes 100 -v error -nostats - 2120 UNITS in planar1, 65393 runs,143 skips -cpuflags 0 19157 UNITS in planar1, 65512 runs, 24 skips 9.03632 speedup, 16be similarly. Fate passes, each format tested with an image to video conversion. Signed-off-by: Lauri Kasanen --- v2: Copy-pasted rows were flipped. libswscale/ppc/swscale_vsx.c | 59 1 file changed, 59 insertions(+) diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 6462c11..70da6ae 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -180,6 +180,60 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +#undef output_pixel + +#define output_pixel(pos, val, bias, signedness) \ +if (big_endian) { \ +AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} else { \ +AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} + +static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +const int shift = 3; + +for (i = start; i < dstW; i++) { +int val = src[i] + (1 << (shift - 1)); +output_pixel(&dest[i], val, 0, uint); +} +} + +static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW, + int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 3; +const int add = (1 << (shift - 1)); +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 
8 : 0); +const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift); +vector uint32_t v, v2; +vector uint16_t vd; +int i; + +yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0); + +for (i = dst_u; i < dstW - 7; i += 8) { +v = vec_vsx_ld(0, (const uint32_t *) &src[i]); +v = vec_add(v, vadd); +v = vec_sr(v, vshift); + +v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]); +v2 = vec_add(v2, vadd); +v2 = vec_sr(v2, vshift); + +vd = vec_packsu(v, v2); +vd = vec_rl(vd, vswap); + +vec_st(vd, 0, &dest[i]); +} + +yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i); +} + #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \ static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \ uint8_t *dest, int dstW, \ @@ -197,6 +251,8 @@ yuv2NBPS(12, BE, 1, nbps, int16_t) yuv2NBPS(12, LE, 0, nbps, int16_t) yuv2NBPS(14, BE, 1, nbps, int16_t) yuv2NBPS(14, LE, 0, nbps, int16_t) +yuv2NBPS(16, BE, 1, 16, int32_t) +yuv2NBPS(16, LE, 0, 16, int32_t) #endif /* !HAVE_BIGENDIAN */ @@ -240,6 +296,9 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c) case 14: c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx; break; +case 16: +c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx; +break; #endif } } -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2 0/2] AltiVec/VSX fixes in swscale
Hi, I'll apply these in a couple days if no objections. Works ok in my tests. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 0/2] AltiVec/VSX fixes in swscale
On Tue, 1 Oct 2019 18:26:20 +0300 Lauri Kasanen wrote: > Hi, > > I'll apply these in a couple days if no objections. Works ok in my > tests. Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] What new instructions would you like?
Hi, For the Libre RISC-V project, I'm going to research the popular codecs and design new instructions to help speed them up. With ffmpeg being home to lots of asm folks for many platforms, I also want to ask your opinion. What new instructions would you like? Anything particular you find missing in existing ISAs, slow, or cumbersome? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] What new instructions would you like?
On Sat, 1 Feb 2020 12:53:28 +0100 James Darnley wrote: > On 30/12/2019, Lauri Kasanen wrote: > > For the Libre RISC-V project, I'm going to research the popular codecs > > and design new instructions to help speed them up. With ffmpeg being > > home to lots of asm folks for many platforms, I also want to ask your > > opinion. > > > > What new instructions would you like? Anything particular you find > > missing in existing ISAs, slow, or cumbersome? > > Do you mean SIMD instructions? I have no idea what exists in RISC-V > already or what capabilities or limitations it has, and I am going to > use x86 language and terms such as byte, word, dword, qword. > > Things I have found missing in old(er) x86 instruction sets are > missing word size and signed/unsigned variants for existing > operations. Some operations may have byte and word variants but dword > and qword might be missing, or there might be a signed version but not > an unsigned version (and vice versa). A couple of things I had to > emulate: > * packed absolute value of dwords > * packed maximum unsigned words > * packed max and min signed dwords (I might have really wanted > unsigned for this) > * arithmetic right shift of qwords > * pack dwords to words with unsigned saturation > > Shuffle instructions. pshufb is very useful and I think I read on IRC > that arm/aarch64/neon does not have an equivalent. (Or was that other > shuffles?) It allows for arbitrary reordering of bytes and setting > bytes to 0. On x86 it takes the shuffle pattern from another SIMD > register but I usually use it with a constant pattern that gets loaded > from memory. An interesting improvement would be if you can encode 17 > * 16 (or however long your vectors might be) values in an immediate > value so it doesn't require another register. > > Good documentation. The intel instruction manual has pretty good > explanation of what the instructions do. 
The old instructions from > around the time of MMX and SSE had excellent diagrams, these might > have been mostly for shuffle operations. I need to look and jog my > memory. I think punpcklbw is an example of what I mean. The entry in > the manual for it has a good diagram IMO. (At least the version I am > currently looking at) > > No stupid lane stuff. AVX2 brought us a SIMD vector length extension > from 16 to 32 bytes. Good except for the stupid lanes they were split > into making it hard to "mix" data from the low 0-15 bytes and the high > 16-31 bytes. > > I forgot about this email for a month. Sorry about that. Seeing > RISC-V in the schedule at FOSDEM reminded me about this. Thanks for your thoughts. The project scope is both SIMD and scalar, if there's for example a particular bit packing that's slow and unparallelizable, it might benefit from a dedicated instruction. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize float yuv2plane1
This function wouldn't benefit from VSX instructions, and input and output share alignment, so I put it under altivec. ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt grayf32le \ -f null -vframes 100 -v error -nostats - 3743 UNITS in planar1, 65495 runs, 41 skips -cpuflags 0 23511 UNITS in planar1, 65530 runs, 6 skips grayf32be 4647 UNITS in planar1, 65449 runs, 87 skips -cpuflags 0 28608 UNITS in planar1, 65530 runs, 6 skips The native speedup is 6.28133, and the bswapping one 6.15623. Fate passes, each format tested with an image to video conversion. Signed-off-by: Lauri Kasanen --- Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated. libswscale/ppc/swscale_altivec.c | 139 ++- 1 file changed, 137 insertions(+), 2 deletions(-) diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index 1d2b2fa..2ef5257 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -31,7 +31,8 @@ #include "yuv2rgb_altivec.h" #include "libavutil/ppc/util_altivec.h" -#if HAVE_ALTIVEC && HAVE_BIGENDIAN +#if HAVE_ALTIVEC +#if HAVE_BIGENDIAN #define vzero vec_splat_s32(0) #define GET_LS(a,b,c,s) {\ @@ -102,7 +103,135 @@ #include "swscale_ppc_template.c" #undef FUNC -#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */ +#endif /* HAVE_BIGENDIAN */ + +#define output_pixel(pos, val, bias, signedness) \ +if (big_endian) { \ +AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} else { \ +AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} + +static void +yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start) +{ +static const int big_endian = HAVE_BIGENDIAN; +static const int shift = 3; +static const float float_mult = 1.0f / 65535.0f; +int i, val; +uint16_t val_uint; + +for (i = start; i < dstW; ++i){ +val = src[i] + (1 << (shift - 1)); +output_pixel(&val_uint, val, 0, uint); +dest[i] = float_mult * (float)val_uint; +} +} + +static void 
+yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start) +{ +static const int big_endian = HAVE_BIGENDIAN; +static const int shift = 3; +static const float float_mult = 1.0f / 65535.0f; +int i, val; +uint16_t val_uint; + +for (i = start; i < dstW; ++i){ +val = src[i] + (1 << (shift - 1)); +output_pixel(&val_uint, val, 0, uint); +dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint)); +} +} + +static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW) +{ +const int dst_u = -(uintptr_t)dest & 3; +const int shift = 3; +const int add = (1 << (shift - 1)); +const int clip = (1 << 16) - 1; +const float fmult = 1.0f / 65535.0f; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift); +const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, clip}; +const vector float vmul = (vector float) {fmult, fmult, fmult, fmult}; +const vector float vzero = (vector float) {0, 0, 0, 0}; +vector uint32_t v; +vector float vd; +int i; + +yuv2plane1_float_u(src, dest, dst_u, 0); + +for (i = dst_u; i < dstW - 3; i += 4) { +v = vec_ld(0, (const uint32_t *) &src[i]); +v = vec_add(v, vadd); +v = vec_sr(v, vshift); +v = vec_min(v, vlargest); + +vd = vec_ctf(v, 0); +vd = vec_madd(vd, vmul, vzero); + +vec_st(vd, 0, &dest[i]); +} + +yuv2plane1_float_u(src, dest, dstW, i); +} + +static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW) +{ +const int dst_u = -(uintptr_t)dest & 3; +const int shift = 3; +const int add = (1 << (shift - 1)); +const int clip = (1 << 16) - 1; +const float fmult = 1.0f / 65535.0f; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift); +const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, clip}; +const vector float vmul = (vector float) {fmult, fmult, fmult, fmult}; +const vector float 
vzero = (vector float) {0, 0, 0, 0}; +const vector uint32_t vswapbig = (vector uint32_t) {16, 16, 16, 16}; +const vector uint16_t vswapsmall = vec_splat_u16(8); +vector uint32_t v; +vector float vd; +int i; + +yuv2plane1_float_bswap_u(src, dest, dst_u, 0); + +for (i = dst_u; i < dstW - 3; i += 4) { +v = vec_ld(0, (const uint32_t *) &src[i]); +v = vec_add(v, vadd); +v = vec_sr(v, vshift); +v = vec_min(v, vlargest); + +vd =
Re: [FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize float yuv2plane1
On Sun, 16 Dec 2018 00:22:00 +0100 Michael Niedermayer wrote: > On Sat, Dec 15, 2018 at 06:32:31PM +0200, Lauri Kasanen wrote: > > Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated. > > > > libswscale/ppc/swscale_altivec.c | 139 > > ++- > > 1 file changed, 137 insertions(+), 2 deletions(-) > > breaks build: > src/libswscale/ppc/swscale_altivec.c: In function ‘yuv2plane1_float_altivec’: > src/libswscale/ppc/swscale_altivec.c:158:80: error: expected declaration > specifiers or ‘...’ before ‘(’ token > const vector float vzero = (vector float) {0, 0, 0, 0}; Thanks for testing. I missed the vzero define at the top, I wonder why my gcc did not break. Patch v2 coming. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1
This function wouldn't benefit from VSX instructions, so I put it under altivec. ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt grayf32le \ -f null -vframes 100 -v error -nostats - 3743 UNITS in planar1, 65495 runs, 41 skips -cpuflags 0 23511 UNITS in planar1, 65530 runs, 6 skips grayf32be 4647 UNITS in planar1, 65449 runs, 87 skips -cpuflags 0 28608 UNITS in planar1, 65530 runs, 6 skips The native speedup is 6.28133, and the bswapping one 6.15623. Fate passes, each format tested with an image to video conversion. Signed-off-by: Lauri Kasanen --- Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated. v2: Added #undef vzero, that define broke the build on older gcc. Thanks Michael libswscale/ppc/swscale_altivec.c | 141 ++- 1 file changed, 139 insertions(+), 2 deletions(-) diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index 1d2b2fa..d72ed1e 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -31,7 +31,8 @@ #include "yuv2rgb_altivec.h" #include "libavutil/ppc/util_altivec.h" -#if HAVE_ALTIVEC && HAVE_BIGENDIAN +#if HAVE_ALTIVEC +#if HAVE_BIGENDIAN #define vzero vec_splat_s32(0) #define GET_LS(a,b,c,s) {\ @@ -102,7 +103,137 @@ #include "swscale_ppc_template.c" #undef FUNC -#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */ +#undef vzero + +#endif /* HAVE_BIGENDIAN */ + +#define output_pixel(pos, val, bias, signedness) \ +if (big_endian) { \ +AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} else { \ +AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \ +} + +static void +yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start) +{ +static const int big_endian = HAVE_BIGENDIAN; +static const int shift = 3; +static const float float_mult = 1.0f / 65535.0f; +int i, val; +uint16_t val_uint; + +for (i = start; i < dstW; ++i){ +val = src[i] + (1 << (shift - 1)); +output_pixel(&val_uint, val, 0, uint); +dest[i] = float_mult * 
(float)val_uint; +} +} + +static void +yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start) +{ +static const int big_endian = HAVE_BIGENDIAN; +static const int shift = 3; +static const float float_mult = 1.0f / 65535.0f; +int i, val; +uint16_t val_uint; + +for (i = start; i < dstW; ++i){ +val = src[i] + (1 << (shift - 1)); +output_pixel(&val_uint, val, 0, uint); +dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint)); +} +} + +static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW) +{ +const int dst_u = -(uintptr_t)dest & 3; +const int shift = 3; +const int add = (1 << (shift - 1)); +const int clip = (1 << 16) - 1; +const float fmult = 1.0f / 65535.0f; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift); +const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, clip}; +const vector float vmul = (vector float) {fmult, fmult, fmult, fmult}; +const vector float vzero = (vector float) {0, 0, 0, 0}; +vector uint32_t v; +vector float vd; +int i; + +yuv2plane1_float_u(src, dest, dst_u, 0); + +for (i = dst_u; i < dstW - 3; i += 4) { +v = vec_ld(0, (const uint32_t *) &src[i]); +v = vec_add(v, vadd); +v = vec_sr(v, vshift); +v = vec_min(v, vlargest); + +vd = vec_ctf(v, 0); +vd = vec_madd(vd, vmul, vzero); + +vec_st(vd, 0, &dest[i]); +} + +yuv2plane1_float_u(src, dest, dstW, i); +} + +static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW) +{ +const int dst_u = -(uintptr_t)dest & 3; +const int shift = 3; +const int add = (1 << (shift - 1)); +const int clip = (1 << 16) - 1; +const float fmult = 1.0f / 65535.0f; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift); +const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, clip}; +const vector float vmul = (vector float) {fmult, fmult, 
fmult, fmult}; +const vector float vzero = (vector float) {0, 0, 0, 0}; +const vector uint32_t vswapbig = (vector uint32_t) {16, 16, 16, 16}; +const vector uint16_t vswapsmall = vec_splat_u16(8); +vector uint32_t v; +vector float vd; +int i; + +yuv2plane1_float_bswap_u(src, dest, dst_u, 0); + +for (i = dst_u; i < dstW - 3; i += 4) { +v = vec_ld(0, (const uint32_t *) &src[i]); +v = vec_add(v, vadd); +v = vec_sr(v,
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1
On Mon, 17 Dec 2018 01:03:36 +0100 Carl Eugen Hoyos wrote: > 2018-12-16 10:06 GMT+01:00, Lauri Kasanen : > > This function wouldn't benefit from VSX instructions, so I put it > > under altivec. > > > > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > > grayf32le \ > > -f null -vframes 100 -v error -nostats - > > > > 3743 UNITS in planar1, 65495 runs, 41 skips > > > > -cpuflags 0 > > > > 23511 UNITS in planar1, 65530 runs, 6 skips > > > > grayf32be > > > > 4647 UNITS in planar1, 65449 runs, 87 skips > > > > -cpuflags 0 > > > > 28608 UNITS in planar1, 65530 runs, 6 skips > > > > The native speedup is 6.28133, and the bswapping one 6.15623. > > > Fate passes > > I wonder a little how, given that grayf32 already breaks fate as-is... Are the tests for it disabled? fate.ffmpeg.org reports 100% success for many platforms. > Note that this function / this pix_fmt currently has no real use-case > afaict. Is there a list of which pix fmts are useful? Of course I don't want to waste both my and reviewers' time, if the format is considered for removal or otherwise broken. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1
On Mon, 17 Dec 2018 14:52:49 +0100 Carl Eugen Hoyos wrote: > >> Note that this function / this pix_fmt currently has no real use-case > >> afaict. > > > > Is there a list of which pix fmts are useful? Of course I don't want to > > waste both my and reviewers' time, if the format is considered for > > removal or otherwise broken. > > The pix_fmt is not deprecated (it's new), what I meant was that it is > currently only used for obscure monochrome Photoshop images > and one filter, so I am not sure optimizing this colour conversion > will help often. Oh, thanks for the clarification. I'm going roughly in difficulty order, doing the easy functions first. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1
On Sun, 16 Dec 2018 11:06:53 +0200 Lauri Kasanen wrote: > This function wouldn't benefit from VSX instructions, so I put it > under altivec. > > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > grayf32le \ > -f null -vframes 100 -v error -nostats - > > 3743 UNITS in planar1, 65495 runs, 41 skips > > -cpuflags 0 > > 23511 UNITS in planar1, 65530 runs, 6 skips > > grayf32be > > 4647 UNITS in planar1, 65449 runs, 87 skips > > -cpuflags 0 > > 28608 UNITS in planar1, 65530 runs, 6 skips > > The native speedup is 6.28133, and the bswapping one 6.15623. > Fate passes, each format tested with an image to video conversion. > > Signed-off-by: Lauri Kasanen > --- > > Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated. > > v2: Added #undef vzero, that define broke the build on older gcc. Thanks > Michael Ping. And of course it's not gcc version dependent, but rather it was the BE ifdef; it was too early in the morning. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] swscale/output: VSX-optimize 9-16 bit yuv2planeX
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \ -s 1920x1728 -f null -vframes 100 -v error -nostats - 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. Fate passes, each format tested with an image to video conversion. Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out of the 16-bit function. This includes the vec_mulo/mule functions too, not just vmuluwm. yuv420p9le 12341 UNITS in planarX, 130976 runs, 96 skips 73752 UNITS in planarX, 131066 runs, 6 skips yuv420p9be 12364 UNITS in planarX, 131025 runs, 47 skips 73001 UNITS in planarX, 131055 runs, 17 skips yuv420p10le 12386 UNITS in planarX, 131042 runs, 30 skips 72735 UNITS in planarX, 131062 runs, 10 skips yuv420p10be 12337 UNITS in planarX, 131045 runs, 27 skips 72734 UNITS in planarX, 131057 runs, 15 skips yuv420p12le 12236 UNITS in planarX, 131058 runs, 14 skips 73029 UNITS in planarX, 131062 runs, 10 skips yuv420p12be 12218 UNITS in planarX, 130973 runs, 99 skips 72402 UNITS in planarX, 131069 runs, 3 skips yuv420p14le 12168 UNITS in planarX, 131067 runs, 5 skips 72480 UNITS in planarX, 131069 runs, 3 skips yuv420p14be 12358 UNITS in planarX, 130948 runs,124 skips 73772 UNITS in planarX, 131063 runs, 9 skips yuv420p16le 10439 UNITS in planarX, 130911 runs,161 skips 157923 UNITS in planarX, 131068 runs, 4 skips yuv420p16be 10463 UNITS in planarX, 130874 runs,198 skips 154405 UNITS in planarX, 131061 runs, 11 skips Signed-off-by: Lauri Kasanen --- The existing VSX yuv2plane1 is also ifdefed out for POWER7, even though it works there. This is for cleanliness mainly, separating the macros would be a bit uglier. If we have POWER7 users who need that one, please speak up. 
libswscale/ppc/swscale_ppc_template.c | 4 +- libswscale/ppc/swscale_vsx.c | 177 +- 2 files changed, 178 insertions(+), 3 deletions(-) diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 00e4b99..11decab 100644 --- a/libswscale/ppc/swscale_ppc_template.c +++ b/libswscale/ppc/swscale_ppc_template.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x) { @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize, yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); for (i = dst_u; i < dstW - 15; i += 16) -FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, +FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, offset, i); yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 70da6ae..baca36c 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -83,6 +83,8 @@ #include "swscale_ppc_template.c" #undef FUNC +#undef vzero + #endif /* !HAVE_BIGENDIAN */ static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +int shift = 11 + 16 - output_bits; + +for (i = start; i < dstW; i++) { +int val = 1 << (shift - 1); +int j; + +for (j = 0; j < filterSize; j++) +val += src[j][i] * filter[j]; + +output_pixel(&dest[i], val); +} +} + +static 
void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, +const int16_t **src, uint16_t *dest, int dstW, +int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 11 + 16 - output_bits; +const int add = (1 << (shift - 1)); +const int clip = (1 << output_bits) - 1; +const uint16_t swap = big_endian ? 8 : 0; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift}; +const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap}; +const vector uint16_t vlargest = (vector uint16_
Re: [FFmpeg-devel] [PATCH] swscale/output: VSX-optimize 9-16 bit yuv2planeX
On Sun, 6 Jan 2019 13:23:43 +0100 Carl Eugen Hoyos wrote: > 2019-01-04 20:43 GMT+01:00, Lauri Kasanen : > > +#ifdef __POWER8_VECTOR__ > > If this is correct, I assume it fixes a bug in the current code > and should be a separate patch, no? > > > case 16: > > c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : > > yuv2plane1_16LE_vsx; > > +c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx : > > yuv2planeX_16LE_vsx; > > break; > > -#endif > > +#endif /* __POWER8_VECTOR__ */ These mails do tend to get long with so many bench results, but that was covered: > The existing VSX yuv2plane1 is also ifdefed out for POWER7, even though it > works there. > This is for cleanliness mainly, separating the macros would be a bit > uglier. If we have POWER7 users who need that one, please speak up. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v2] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \ -s 1920x1728 -f null -vframes 100 -v error -nostats - 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. Fate passes, each format tested with an image to video conversion. Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out of the 16-bit function. This includes the vec_mulo/mule functions too, not just vmuluwm. yuv420p9le 12341 UNITS in planarX, 130976 runs, 96 skips 73752 UNITS in planarX, 131066 runs, 6 skips yuv420p9be 12364 UNITS in planarX, 131025 runs, 47 skips 73001 UNITS in planarX, 131055 runs, 17 skips yuv420p10le 12386 UNITS in planarX, 131042 runs, 30 skips 72735 UNITS in planarX, 131062 runs, 10 skips yuv420p10be 12337 UNITS in planarX, 131045 runs, 27 skips 72734 UNITS in planarX, 131057 runs, 15 skips yuv420p12le 12236 UNITS in planarX, 131058 runs, 14 skips 73029 UNITS in planarX, 131062 runs, 10 skips yuv420p12be 12218 UNITS in planarX, 130973 runs, 99 skips 72402 UNITS in planarX, 131069 runs, 3 skips yuv420p14le 12168 UNITS in planarX, 131067 runs, 5 skips 72480 UNITS in planarX, 131069 runs, 3 skips yuv420p14be 12358 UNITS in planarX, 130948 runs,124 skips 73772 UNITS in planarX, 131063 runs, 9 skips yuv420p16le 10439 UNITS in planarX, 130911 runs,161 skips 157923 UNITS in planarX, 131068 runs, 4 skips yuv420p16be 10463 UNITS in planarX, 130874 runs,198 skips 154405 UNITS in planarX, 131061 runs, 11 skips Signed-off-by: Lauri Kasanen --- v2: Separate macros so that yuv2plane1_16_vsx remains available for power7 libswscale/ppc/swscale_ppc_template.c | 4 +- libswscale/ppc/swscale_vsx.c | 190 +- 2 files changed, 189 insertions(+), 5 deletions(-) diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 00e4b99..11decab 100644 --- a/libswscale/ppc/swscale_ppc_template.c +++ b/libswscale/ppc/swscale_ppc_template.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 
02110-1301 USA */ -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x) { @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize, yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); for (i = dst_u; i < dstW - 15; i += 16) -FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, +FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, offset, i); yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 70da6ae..1fd392e 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -83,6 +83,8 @@ #include "swscale_ppc_template.c" #undef FUNC +#undef vzero + #endif /* !HAVE_BIGENDIAN */ static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +int shift = 11 + 16 - output_bits; + +for (i = start; i < dstW; i++) { +int val = 1 << (shift - 1); +int j; + +for (j = 0; j < filterSize; j++) +val += src[j][i] * filter[j]; + +output_pixel(&dest[i], val); +} +} + +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, +const int16_t **src, uint16_t *dest, int dstW, +int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 11 + 16 - output_bits; +const int add = (1 << (shift - 1)); +const int clip = (1 << output_bits) - 1; +const uint16_t swap = big_endian ? 
8 : 0; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift}; +const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap}; +const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip}; +const vector int16_t vzero = vec_splat_s16(0); +const vector uint8_t vperm = (vector uint8_t
[FFmpeg-devel] Video codec design for very low-end decoder
Hi, If you were to design a video codec for a very low-end decoder, what would it look like? My target is MIPS 100MHz, and it should decode 320x240x30 at full speed in software, with headroom for audio too. Seems all the codec research in the last 20 years has been more quality with more overhead, nobody looking into "improve quality without more overhead". Currently I'm thinking it would have to be a variant of vector quantization, like Cinepak. The target bitrates however are ~250 kbps or lower, where Cinepak targeted 1200 or higher. Are there any tricks that would improve quality with only encoder-side effort? What is the current top-of-the-line interframe prediction, that is still fast to decode? The platform is fast enough to play back mpeg1, and xvid simple profile L3 barely. Cinepak should also work, but I'd like the quality to be higher than these three. The last relevant VQ paper I found was https://arxiv.org/abs/1710.05311 which used a genetic algorithm to seed the codebook generation, improving PSNR by a few dB over previous approaches. I've implemented that (for a single grayscale frame), but it looks too bad at reasonable bitrates. The modern approaches, DCT, FFT, wavelets and such transforms, are all likely too slow to decode. Not sure if this would be better off on other MLs, didn't seem to apply to ffmpeg-user really. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] Video codec design for very low-end decoder
On Mon, 7 Jan 2019 13:44:56 +0100 Michael Niedermayer wrote: > > The modern approaches, DCT, FFT, wavelets and such transforms, are all > > likely too slow to decode. > > you said it can do mpeg1 and xvid, these are DCT based > have you tried H.264 ? (i imagine that might with asm optimizations > and avoidance of more complex features like CABAC and the loop filter > work maybe, maybe not) > also if h.264 with everything disabled works maybe some features can > be turned on sometimes like the loop filter for key frames, that > might then help compression ... > > and beating an existing codec, while certainly possible might be hard According to a 2010 comparison https://keyj.emphy.de/video-encoder-comparison/ x264 constrained baseline (everything off) takes something like 30% longer to decode vs xvid at the same rate. Probably more because that site used xvid's full features, while I used it "everything off". The issue with xvid simple and mpeg1 was that they were slightly too slow, and looked too bad. The platform does not have any SIMD, so I doubt asm optimizations will help much. Cinepak is almost 30 years old, surely it should be possible to match the decoding & quality, but at a 5x lower bitrate :P - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] Video codec design for very low-end decoder
On Mon, 7 Jan 2019 17:42:58 +0100 Michael Niedermayer wrote: > > According to a 2010 comparison > > https://keyj.emphy.de/video-encoder-comparison/ > > x264 constrained baseline (everything off) takes something like 30% > > longer to decode vs xvid at the same rate. Probably more because that > > site used xvid's full features, while I used it "everything off". > > constrained baseline is not "everything off" Wikipedia's table shows CBP as "all off", but perhaps it doesn't list every option. It lists CABAC etc, but not deblocking. Do you think the unlisted options could account for 30%? > > The issue with xvid simple and mpeg1 were that they were slightly too > > slow, and looked too bad. The platform does not have any SIMD, so I > > doubt asm optimizations will help much. > > I would guess that with rare or odd architectures > compilers are not so good when it comes to generating efficient code. > > I would not be surprised if someone who knows the target CPUs pipeline > and timings could beat the compiler by quite some amount. > This is one part where the amount of man hours needed is significant > of course. Would that be worth it, well its your project you have > to know what amount of work you are willing to do for this, > i wouldnt do that work ;) > > besides, why this low end chip ? Just for fun ;) MIPS does not have any timings, all instructions complete in the same amount of cycles (except floating point, cache misses, interrupts, etc). This makes it fairly suitable for a compiler I think, limiting what could be gotten from hand-writing asm. On Mon, 7 Jan 2019 12:02:58 -0500 "Ronald S. Bultje" wrote: > Have you considered vp8? It may sound weird but this is basically what vp8 > was great at: being really simple to decode. VP8 has a reputation of being slow, so I didn't consider it. Benchmarks show it as decoding slower than h264. Perhaps it too has features that can be disabled? 
- Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection
The existing code was in no released kernel that I can see. The corrected code was added in 3.9. Signed-off-by: Lauri Kasanen --- libavutil/ppc/cpu.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libavutil/ppc/cpu.c b/libavutil/ppc/cpu.c index 7bb7cd8..b022149 100644 --- a/libavutil/ppc/cpu.c +++ b/libavutil/ppc/cpu.c @@ -93,13 +93,13 @@ int ff_get_cpu_flags_ppc(void) if (buf[i + 1] & PPC_FEATURE_HAS_VSX) ret |= AV_CPU_FLAG_VSX; #endif -#ifdef PPC_FEATURE_ARCH_2_07 -if (buf[i + 1] & PPC_FEATURE_HAS_POWER8) -ret |= AV_CPU_FLAG_POWER8; -#endif if (ret & AV_CPU_FLAG_VSX) av_assert0(ret & AV_CPU_FLAG_ALTIVEC); -goto out; +} else if (buf[i] == AT_HWCAP2) { +#ifdef PPC_FEATURE2_ARCH_2_07 +if (buf[i + 1] & PPC_FEATURE2_ARCH_2_07) +ret |= AV_CPU_FLAG_POWER8; +#endif } } } -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v3] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \ -s 1920x1728 -f null -vframes 100 -v error -nostats - 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. Fate passes, each format tested with an image to video conversion. Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out of the 16-bit function. This includes the vec_mulo/mule functions too, not just vmuluwm. yuv420p9le 12341 UNITS in planarX, 130976 runs, 96 skips 73752 UNITS in planarX, 131066 runs, 6 skips yuv420p9be 12364 UNITS in planarX, 131025 runs, 47 skips 73001 UNITS in planarX, 131055 runs, 17 skips yuv420p10le 12386 UNITS in planarX, 131042 runs, 30 skips 72735 UNITS in planarX, 131062 runs, 10 skips yuv420p10be 12337 UNITS in planarX, 131045 runs, 27 skips 72734 UNITS in planarX, 131057 runs, 15 skips yuv420p12le 12236 UNITS in planarX, 131058 runs, 14 skips 73029 UNITS in planarX, 131062 runs, 10 skips yuv420p12be 12218 UNITS in planarX, 130973 runs, 99 skips 72402 UNITS in planarX, 131069 runs, 3 skips yuv420p14le 12168 UNITS in planarX, 131067 runs, 5 skips 72480 UNITS in planarX, 131069 runs, 3 skips yuv420p14be 12358 UNITS in planarX, 130948 runs,124 skips 73772 UNITS in planarX, 131063 runs, 9 skips yuv420p16le 10439 UNITS in planarX, 130911 runs,161 skips 157923 UNITS in planarX, 131068 runs, 4 skips yuv420p16be 10463 UNITS in planarX, 130874 runs,198 skips 154405 UNITS in planarX, 131061 runs, 11 skips Signed-off-by: Lauri Kasanen --- v2: Separate macros so that yuv2plane1_16_vsx remains available for power7 v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at least power8, meaning with the current setup such a binary wouldn't run on POWER7. 
However using the configure define lets it be disabled in configure like Michael pointed out, and having the runtime check doesn't hurt any (it allows for future splits like on x86, where one binary can run on low cpu but use higher ISA if available). libswscale/ppc/swscale_ppc_template.c | 4 +- libswscale/ppc/swscale_vsx.c | 195 +- 2 files changed, 193 insertions(+), 6 deletions(-) diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 00e4b99..11decab 100644 --- a/libswscale/ppc/swscale_ppc_template.c +++ b/libswscale/ppc/swscale_ppc_template.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x) { @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize, yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); for (i = dst_u; i < dstW - 15; i += 16) -FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, +FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, offset, i); yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 70da6ae..77680f8 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -83,6 +83,8 @@ #include "swscale_ppc_template.c" #undef FUNC +#undef vzero + #endif /* !HAVE_BIGENDIAN */ static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int 
output_bits, int start) +{ +int i; +int shift = 11 + 16 - output_bits; + +for (i = start; i < dstW; i++) { +int val = 1 << (shift - 1); +int j; + +for (j = 0; j < filterSize; j++) +val += src[j][i] * filter[j]; + +output_pixel(&dest[i], val); +} +} + +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, +const int16_t **src, uint16_t *dest, int dstW, +int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 11 + 16 - output_bits; +const int add = (1 << (shift - 1)); +const int clip = (1 << output_bits) - 1; +const uint16_
Re: [FFmpeg-devel] Armada 370 problem causes ffmpeg segmentation fault
On Tue, 08 Jan 2019 21:32:30 +0000 Simon Nash wrote: > I have encountered a problem with ffmpeg (a segmentation fault) that > occurs only when running ffmpeg on the Marvell Armada 370 processor. ... > When the 32-bit floating-point multiply instruction > 0x0018a8f2 : vmla.f32 s12, s15, s15 > at activate+1690 is executed, there is a segmentation fault. You don't want to go whack-a-mole on this, since there could be 1500 other places in just ffmpeg that could hit this. You want to fix this in your compiler; it already has similar errata workarounds for almost every processor. Then every such case will work automatically. So, 1) Find the errata from the processor manufacturer 2) Report bug with that to gcc/clang/whatever compiler you use If there is no known errata for this, and you managed to find a new one, contact the processor manufacturer. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection
On Wed, 9 Jan 2019 21:55:30 +0100 Carl Eugen Hoyos wrote: > 2019-01-08 10:08 GMT+01:00, Lauri Kasanen : > > The existing code was in no released kernel that I can see. The corrected > > code > > was added in 3.9. > > > > Signed-off-by: Lauri Kasanen > > --- > > libavutil/ppc/cpu.c | 10 +- > > 1 file changed, 5 insertions(+), 5 deletions(-) > > > > diff --git a/libavutil/ppc/cpu.c b/libavutil/ppc/cpu.c > > index 7bb7cd8..b022149 100644 > > --- a/libavutil/ppc/cpu.c > > +++ b/libavutil/ppc/cpu.c > > @@ -93,13 +93,13 @@ int ff_get_cpu_flags_ppc(void) > > if (buf[i + 1] & PPC_FEATURE_HAS_VSX) > > ret |= AV_CPU_FLAG_VSX; > > #endif > > -#ifdef PPC_FEATURE_ARCH_2_07 > > -if (buf[i + 1] & PPC_FEATURE_HAS_POWER8) > > -ret |= AV_CPU_FLAG_POWER8; > > -#endif > > if (ret & AV_CPU_FLAG_VSX) > > av_assert0(ret & AV_CPU_FLAG_ALTIVEC); > > > -goto out; > > This seems like an unrelated change. It's necessary. HWCAP appears before HWCAP2 in the array, so if the code jumps out in HWCAP, it never gets to checking the CAP2 bits like power8. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v3] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
On Wed, 9 Jan 2019 22:26:25 +0100 Carl Eugen Hoyos wrote: > > +#ifdef __GNUC__ > > +// GCC does not support vmuluwm yet. Bug open. > > +__asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l), > > "v"(vfilter[j])); > > +vleft = vec_add(vleft, vtmp); > > +__asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r), > > "v"(vfilter[j])); > > +vright = vec_add(vright, vtmp); > > +#else > > +// No idea which compilers this works in, untested. Copied from > > libsimdpp > > +vtmp = vec_vmuluwm(vin32l, vfilter[j]); > > +vleft = vec_add(vleft, vtmp); > > +vtmp = vec_vmuluwm(vin32r, vfilter[j]); > > +vright = vec_add(vright, vtmp); > > +#endif > > Is there no xlc installed on your test system? > I suspect an earlier patch from you already > broke xlc compilation... No, I don't really care about proprietary compilers. You reported previously that xlc created invalid code anyway? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v4] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \ -s 1920x1728 -f null -vframes 100 -v error -nostats - 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. Fate passes, each format tested with an image to video conversion. Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out of the 16-bit function. This includes the vec_mulo/mule functions too, not just vmuluwm. yuv420p9le 12341 UNITS in planarX, 130976 runs, 96 skips 73752 UNITS in planarX, 131066 runs, 6 skips yuv420p9be 12364 UNITS in planarX, 131025 runs, 47 skips 73001 UNITS in planarX, 131055 runs, 17 skips yuv420p10le 12386 UNITS in planarX, 131042 runs, 30 skips 72735 UNITS in planarX, 131062 runs, 10 skips yuv420p10be 12337 UNITS in planarX, 131045 runs, 27 skips 72734 UNITS in planarX, 131057 runs, 15 skips yuv420p12le 12236 UNITS in planarX, 131058 runs, 14 skips 73029 UNITS in planarX, 131062 runs, 10 skips yuv420p12be 12218 UNITS in planarX, 130973 runs, 99 skips 72402 UNITS in planarX, 131069 runs, 3 skips yuv420p14le 12168 UNITS in planarX, 131067 runs, 5 skips 72480 UNITS in planarX, 131069 runs, 3 skips yuv420p14be 12358 UNITS in planarX, 130948 runs,124 skips 73772 UNITS in planarX, 131063 runs, 9 skips yuv420p16le 10439 UNITS in planarX, 130911 runs,161 skips 157923 UNITS in planarX, 131068 runs, 4 skips yuv420p16be 10463 UNITS in planarX, 130874 runs,198 skips 154405 UNITS in planarX, 131061 runs, 11 skips Signed-off-by: Lauri Kasanen --- v2: Separate macros so that yuv2plane1_16_vsx remains available for power7 v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check v4: #if HAVE_POWER8 libswscale/ppc/swscale_ppc_template.c | 4 +- libswscale/ppc/swscale_vsx.c | 195 +- 2 files changed, 193 insertions(+), 6 deletions(-) diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 00e4b99..11decab 100644 --- a/libswscale/ppc/swscale_ppc_template.c +++ 
b/libswscale/ppc/swscale_ppc_template.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x) { @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize, yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); for (i = dst_u; i < dstW - 15; i += 16) -FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, +FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, offset, i); yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 70da6ae..12effe2 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -83,6 +83,8 @@ #include "swscale_ppc_template.c" #undef FUNC +#undef vzero + #endif /* !HAVE_BIGENDIAN */ static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +int shift = 11 + 16 - output_bits; + +for (i = start; i < dstW; i++) { +int val = 1 << (shift - 1); +int j; + +for (j = 0; j < filterSize; j++) +val += src[j][i] * filter[j]; + +output_pixel(&dest[i], val); +} +} + +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, +const int16_t **src, uint16_t *dest, int dstW, +int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 11 + 16 - output_bits; +const int add = (1 << (shift - 1)); +const int clip = (1 << output_bits) 
- 1; +const uint16_t swap = big_endian ? 8 : 0; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift}; +const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap}; +const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip,
Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection
On Thu, 10 Jan 2019 18:09:21 +0100 Carl Eugen Hoyos wrote: > >> > -goto out; > >> > >> This seems like an unrelated change. > > > > It's necessary. HWCAP appears before HWCAP2 in the array, so if the > > code jumps out in HWCAP, it never gets to checking the CAP2 bits like > > power8. > > The next line (that I unfortunately cut) is: > } else if (buf[i] == AT_HWCAP2) { > indicating afaict that it is only reached if buf[i] is not equal > to HWCAP. > What do I miss? The surrounding context is a loop over all bytes: for (i = 0; i < count / sizeof(*buf); i += 2) { While the out: label is after the loop. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v4] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
On Fri, 11 Jan 2019 09:56:15 +0100 Michael Niedermayer wrote: > > +#ifdef __GNUC__ > > +// GCC does not support vmuluwm yet. Bug open. > > this should probably be tested by configure similar to how other > compiler limitations are tested We can't really test for it, because there is no standard name for it. I don't know what name the gcc devs will pick for it, it could be vec_mul, vec_vmuluwm or something different. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v4] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
On Sat, 12 Jan 2019 01:03:09 +0100 Michael Niedermayer wrote: > On Fri, Jan 11, 2019 at 11:16:20AM +0200, Lauri Kasanen wrote: > > On Fri, 11 Jan 2019 09:56:15 +0100 > > Michael Niedermayer wrote: > > > > > > +#ifdef __GNUC__ > > > > +// GCC does not support vmuluwm yet. Bug open. > > > > > > this should probably be tested by configure similar to how other > > > compiler limitations are tested > > > > We can't really test for it, because there is no standard name for it. I > > don't know what name the gcc devs will pick for it, it could be vec_mul, > > vec_vmuluwm or something different. > > the code contains a #if and a #else case > so i thought there was something else than the __GNUC__ case and gcc > would follow that It's second-hand info from libsimdpp. I don't know where they got it. However, I found out yesterday that gcc docs are wrong, and vec_mul for gcc does use the correct instruction on power8. Respinning. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v5] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \ -s 1920x1728 -f null -vframes 100 -v error -nostats - 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. Fate passes, each format tested with an image to video conversion. Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out of the 16-bit function. This includes the vec_mulo/mule functions too, not just vmuluwm. yuv420p9le 12341 UNITS in planarX, 130976 runs, 96 skips 73752 UNITS in planarX, 131066 runs, 6 skips yuv420p9be 12364 UNITS in planarX, 131025 runs, 47 skips 73001 UNITS in planarX, 131055 runs, 17 skips yuv420p10le 12386 UNITS in planarX, 131042 runs, 30 skips 72735 UNITS in planarX, 131062 runs, 10 skips yuv420p10be 12337 UNITS in planarX, 131045 runs, 27 skips 72734 UNITS in planarX, 131057 runs, 15 skips yuv420p12le 12236 UNITS in planarX, 131058 runs, 14 skips 73029 UNITS in planarX, 131062 runs, 10 skips yuv420p12be 12218 UNITS in planarX, 130973 runs, 99 skips 72402 UNITS in planarX, 131069 runs, 3 skips yuv420p14le 12168 UNITS in planarX, 131067 runs, 5 skips 72480 UNITS in planarX, 131069 runs, 3 skips yuv420p14be 12358 UNITS in planarX, 130948 runs,124 skips 73772 UNITS in planarX, 131063 runs, 9 skips yuv420p16le 10439 UNITS in planarX, 130911 runs,161 skips 157923 UNITS in planarX, 131068 runs, 4 skips yuv420p16be 10463 UNITS in planarX, 130874 runs,198 skips 154405 UNITS in planarX, 131061 runs, 11 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_ppc_template.c | 4 +- libswscale/ppc/swscale_vsx.c | 186 +- 2 files changed, 184 insertions(+), 6 deletions(-) v2: Separate macros so that yuv2plane1_16_vsx remains available for power7 v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check v4: #if HAVE_POWER8 v5: Get rid of the mul #if, turns out gcc vec_mul works diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 00e4b99..11decab 100644 --- 
a/libswscale/ppc/swscale_ppc_template.c +++ b/libswscale/ppc/swscale_ppc_template.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x) { @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize, yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); for (i = dst_u; i < dstW - 15; i += 16) -FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, +FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, offset, i); yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 70da6ae..f6c7f1d 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -83,6 +83,8 @@ #include "swscale_ppc_template.c" #undef FUNC +#undef vzero + #endif /* !HAVE_BIGENDIAN */ static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +int shift = 11 + 16 - output_bits; + +for (i = start; i < dstW; i++) { +int val = 1 << (shift - 1); +int j; + +for (j = 0; j < filterSize; j++) +val += src[j][i] * filter[j]; + +output_pixel(&dest[i], val); +} +} + +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, +const int16_t **src, uint16_t *dest, int dstW, +int big_endian, int output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 11 + 16 - output_bits; +const int add = (1 << (shift 
- 1)); +const int clip = (1 << output_bits) - 1; +const uint16_t swap = big_endian ? 8 : 0; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift}; +const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap}; +const vector uint16_t vlargest = (vector uint16_
Re: [FFmpeg-devel] [PATCH v5] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
On Sat, 12 Jan 2019 14:52:07 +0100 Michael Niedermayer wrote: > On Sat, Jan 12, 2019 at 10:47:50AM +0200, Lauri Kasanen wrote: > > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > > yuv420p16be \ > > -s 1920x1728 -f null -vframes 100 -v error -nostats - > > > > 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. > > Fate passes, each format tested with an image to video conversion. > > > > Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out > > of the 16-bit function. This includes the vec_mulo/mule functions too, > > not just vmuluwm. > > > > yuv420p9le > > 12341 UNITS in planarX, 130976 runs, 96 skips > > 73752 UNITS in planarX, 131066 runs, 6 skips > > yuv420p9be > > 12364 UNITS in planarX, 131025 runs, 47 skips > > 73001 UNITS in planarX, 131055 runs, 17 skips > > yuv420p10le > > 12386 UNITS in planarX, 131042 runs, 30 skips > > 72735 UNITS in planarX, 131062 runs, 10 skips > > yuv420p10be > > 12337 UNITS in planarX, 131045 runs, 27 skips > > 72734 UNITS in planarX, 131057 runs, 15 skips > > yuv420p12le > > 12236 UNITS in planarX, 131058 runs, 14 skips > > 73029 UNITS in planarX, 131062 runs, 10 skips > > yuv420p12be > > 12218 UNITS in planarX, 130973 runs, 99 skips > > 72402 UNITS in planarX, 131069 runs, 3 skips > > yuv420p14le > > 12168 UNITS in planarX, 131067 runs, 5 skips > > 72480 UNITS in planarX, 131069 runs, 3 skips > > yuv420p14be > > 12358 UNITS in planarX, 130948 runs,124 skips > > 73772 UNITS in planarX, 131063 runs, 9 skips > > yuv420p16le > > 10439 UNITS in planarX, 130911 runs,161 skips > > 157923 UNITS in planarX, 131068 runs, 4 skips > > yuv420p16be > > 10463 UNITS in planarX, 130874 runs,198 skips > > 154405 UNITS in planarX, 131061 runs, 11 skips > > The number of skips in the benchmark is much larger on one > side. 
That way the numbers become hard to compare as > more cases are skipped on one side > > please adjust the parameters so the skip counts are comparable > or redo the tests until the numbers are more similar > thanks How do I do that? It's a VM, so there are going to be pauses no matter what, when other VMs run. Or should I take the largest run count with about the same skips? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH v6] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \ -s 1920x1728 -f null -vframes 100 -v error -nostats - 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. Fate passes, each format tested with an image to video conversion. Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out of the 16-bit function. This includes the vec_mulo/mule functions too, not just vmuluwm. With TIMER_REPORT skips disabled: yuv420p9le 12412 UNITS in planarX, 131072 runs, 0 skips 73136 UNITS in planarX, 131072 runs, 0 skips yuv420p9be 12481 UNITS in planarX, 131072 runs, 0 skips 73410 UNITS in planarX, 131072 runs, 0 skips yuv420p10le 12322 UNITS in planarX, 131072 runs, 0 skips 72546 UNITS in planarX, 131072 runs, 0 skips yuv420p10be 12291 UNITS in planarX, 131072 runs, 0 skips 72935 UNITS in planarX, 131072 runs, 0 skips yuv420p12le 12316 UNITS in planarX, 131072 runs, 0 skips 72708 UNITS in planarX, 131072 runs, 0 skips yuv420p12be 12319 UNITS in planarX, 131072 runs, 0 skips 72577 UNITS in planarX, 131072 runs, 0 skips yuv420p14le 12259 UNITS in planarX, 131072 runs, 0 skips 72516 UNITS in planarX, 131072 runs, 0 skips yuv420p14be 12440 UNITS in planarX, 131072 runs, 0 skips 72962 UNITS in planarX, 131072 runs, 0 skips yuv420p16le 10548 UNITS in planarX, 131072 runs, 0 skips 73429 UNITS in planarX, 131072 runs, 0 skips yuv420p16be 10634 UNITS in planarX, 131072 runs, 0 skips 150959 UNITS in planarX, 131072 runs, 0 skips Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_ppc_template.c | 4 +- libswscale/ppc/swscale_vsx.c | 186 +- 2 files changed, 184 insertions(+), 6 deletions(-) v6: No patch changes, updated bench numbers without skips. 
diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 00e4b99..11decab 100644 --- a/libswscale/ppc/swscale_ppc_template.c +++ b/libswscale/ppc/swscale_ppc_template.c @@ -21,7 +21,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize, +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, const uint8_t *dither, int offset, int x) { @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize, yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0); for (i = dst_u; i < dstW - 15; i += 16) -FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither, +FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither, offset, i); yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i); diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index 70da6ae..f6c7f1d 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -83,6 +83,8 @@ #include "swscale_ppc_template.c" #undef FUNC +#undef vzero + #endif /* !HAVE_BIGENDIAN */ static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW, @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW, yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i); } +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize, + const int16_t **src, uint16_t *dest, int dstW, + int big_endian, int output_bits, int start) +{ +int i; +int shift = 11 + 16 - output_bits; + +for (i = start; i < dstW; i++) { +int val = 1 << (shift - 1); +int j; + +for (j = 0; j < filterSize; j++) +val += src[j][i] * filter[j]; + +output_pixel(&dest[i], val); +} +} + +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize, +const int16_t **src, uint16_t *dest, int dstW, +int big_endian, int 
output_bits) +{ +const int dst_u = -(uintptr_t)dest & 7; +const int shift = 11 + 16 - output_bits; +const int add = (1 << (shift - 1)); +const int clip = (1 << output_bits) - 1; +const uint16_t swap = big_endian ? 8 : 0; +const vector uint32_t vadd = (vector uint32_t) {add, add, add, add}; +const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift}; +const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap}; +const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip}; +const vector int16_t vzero = vec_splat_s16(0); +const vector uint8_t vperm
Re: [FFmpeg-devel] Video codec design for very low-end decoder
On Mon, 7 Jan 2019 12:37:01 -0500 "Ronald S. Bultje" wrote: > On Mon, Jan 7, 2019 at 12:22 PM Lauri Kasanen wrote: > > "Ronald S. Bultje" wrote: > > > > > Have you considered vp8? It may sound weird but this is basically what > > > vp8 was great at: being really simple to decode. > > > > VP8 has a reputation of being slow, so I didn't consider it. Benchmarks > > show it as decoding slower than h264. > > It is faster than h264 when comparing ffh264 vs. ffvp8 I tried VP8 on the target platform (libvpx 1.7.0). It took 32% longer to decode the test vid than xvid, and given xvid was already a bit under realtime, VP8 is out. Curiously, VP8 also added very objectionable artifacts. Some blocks *moved* around in frames. That looked very bad, neither xvid nor h264 caused that, they were just blocky or blurry. VP8 also looked worst of the three, by eye. x264 "everything disabled AFAICT" actually looks very good for the bitrate. Too bad I can't use H.264 due to the patent situation, so not going to spend time benching it either. Settings used: vpxenc -p 2 --profile=3 --target-bitrate=250 --best --end-usage=vbr --codec=vp8 --min-q=0 --max-q=60 --ivf mencoder -ovc x264 -x264encopts preset=veryslow:pass=2:bitrate=250:tune=fastdecode:profile=baseline (tune=fastdecode disables deblocking, the result file confirms all heavy options are off) - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection
On Tue, 8 Jan 2019 11:08:04 +0200 Lauri Kasanen wrote: > The existing code was in no released kernel that I can see. The corrected code > was added in 3.9. > > Signed-off-by: Lauri Kasanen > --- > libavutil/ppc/cpu.c | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) Ping. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v6] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
On Mon, 14 Jan 2019 16:13:52 +0100 Michael Niedermayer wrote: > On Sun, Jan 13, 2019 at 10:26:20AM +0200, Lauri Kasanen wrote: > > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > > yuv420p16be \ > > -s 1920x1728 -f null -vframes 100 -v error -nostats - > > > > 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. > > Fate passes, each format tested with an image to video conversion. > > > > Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out > > of the 16-bit function. This includes the vec_mulo/mule functions too, > > not just vmuluwm. ... > > v6: No patch changes, updated bench numbers without skips. > > fate does not get worse from this patch on qemu ppc32be and ppc64le Ping - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection
On Thu, 17 Jan 2019 09:40:09 +0200 Lauri Kasanen wrote: > On Tue, 8 Jan 2019 11:08:04 +0200 > Lauri Kasanen wrote: > > > The existing code was in no released kernel that I can see. The corrected > > code > > was added in 3.9. > > > > Signed-off-by: Lauri Kasanen > > --- > > libavutil/ppc/cpu.c | 10 +- > > 1 file changed, 5 insertions(+), 5 deletions(-) > > Ping. Ping. Carl Eugen, you were the only one who looked at it - could you apply it? Given the low interest in power patches, should I be applying for commit rights? - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] MAINTAINERS: add myself to the PPC section
Signed-off-by: Lauri Kasanen --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) Ref http://ffmpeg.org/pipermail/ffmpeg-devel/2019-January/239357.html Requesting commit access so I don't have to constantly bug Michael. diff --git a/MAINTAINERS b/MAINTAINERS index bc2ae13..e3a80e9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -526,6 +526,7 @@ Alpha Falk Hueffner MIPS Manojkumar Bhosale, Shiyou Yin Mac OS X / PowerPC Romain Dolbeau, Guillaume Poirier Amiga / PowerPC Colin Ward +Linux / PowerPC Lauri Kasanen Windows MinGW Alex Beregszaszi, Ramiro Polla Windows Cygwin Victor Paesa Windows MSVC Matthew Oliver, Hendrik Leppkes -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection
On Tue, 8 Jan 2019 11:08:04 +0200 Lauri Kasanen wrote: > The existing code was in no released kernel that I can see. The corrected code > was added in 3.9. > > Signed-off-by: Lauri Kasanen > --- > libavutil/ppc/cpu.c | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH v6] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX
On Sun, 13 Jan 2019 10:26:20 +0200 Lauri Kasanen wrote: > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt > yuv420p16be \ > -s 1920x1728 -f null -vframes 100 -v error -nostats - > > 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x. > Fate passes, each format tested with an image to video conversion. > > Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out > of the 16-bit function. This includes the vec_mulo/mule functions too, > not just vmuluwm. Applying. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 2/2] avcodec/pnm: Avoid structure pointer dereferences in inner loop in pnm_get()
On Thu, 21 Feb 2019 20:34:29 +0100 Michael Niedermayer wrote: > Improves speed from 5.4 to 4.2 seconds > Fixes: > 13149/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_PGM_fuzzer-5760833622114304 LGTM Though, I really would expect the compiler to detect and optimize that. I wonder if "PNMContext * const sc" would help it any. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH] avcodec/tiff: Add support for recognizing DNG files
On Mon, 18 Mar 2019 09:13:01 +0100 Moritz Barsnick wrote: > On Sun, Mar 17, 2019 at 23:05:01 +0100, Paul B Mahol wrote: > > Still wrong, You can decode images you linked just fine (albeit with > > incorrect colors) with command: > > > > ffmpeg -subimage 1 -i IMAGE.dng rest of command. > > Shouldn't, ideally, these image files be demuxed as two image streams? > Perhaps with the "main" image as the first stream. The DNG spec is pretty massive, and there's a huge number of variations. There can easily be far more than two streams, there could be several "main" images and several previews in different sizes. Their order can vary too, it's not always the thumbnail first; thumbnails can also be omitted entirely. There are also several different encodings/compression types for the "main" images. I've used their libdng for a project. It's a big LGPL library implementing pretty much everything, but no distro really ships it, so it'd have to be embedded or built manually by the user. - Lauri ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 1/2] swscale/ppc: Clean up some mixed decl warnings
Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_altivec.c | 6 +++--- libswscale/ppc/swscale_ppc_template.c | 9 + libswscale/ppc/swscale_vsx.c | 6 +++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c index d72ed1e..3cd9782 100644 --- a/libswscale/ppc/swscale_altivec.c +++ b/libswscale/ppc/swscale_altivec.c @@ -43,10 +43,10 @@ #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\ vector signed short ls;\ +vector signed int vf1, vf2, i1, i2;\ GET_LS(l1, x, perm, src);\ -vector signed int i1 = vec_mule(filter, ls);\ -vector signed int i2 = vec_mulo(filter, ls);\ -vector signed int vf1, vf2;\ +i1 = vec_mule(filter, ls);\ +i2 = vec_mulo(filter, ls);\ vf1 = vec_mergeh(i1, i2);\ vf2 = vec_mergel(i1, i2);\ d1 = vec_add(d1, vf1);\ diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 11decab..3964a7a 100644 --- a/libswscale/ppc/swscale_ppc_template.c +++ b/libswscale/ppc/swscale_ppc_template.c @@ -184,16 +184,17 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW, for (j = 0; j < filterSize - 15; j += 16) { vector unsigned char src_v1, src_vF; -vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1; +vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1, src_vA, src_vB; +vector signed int val_acc; LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF); -vector signed short src_vA = // vec_unpackh sign-extends... +src_vA = // vec_unpackh sign-extends... (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF)); -vector signed short src_vB = // vec_unpackh sign-extends... +src_vB = // vec_unpackh sign-extends... 
(vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF)); GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0); GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16); -vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v); +val_acc = vec_msums(src_vA, filter_v0, val_v); val_v = vec_msums(src_vB, filter_v1, val_acc); UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0); } diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c index f6c7f1d..01eb46c 100644 --- a/libswscale/ppc/swscale_vsx.c +++ b/libswscale/ppc/swscale_vsx.c @@ -42,10 +42,10 @@ #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\ vector signed short ls;\ +vector signed int vf1, vf2, i1, i2;\ GET_LS(l1, x, perm, src);\ -vector signed int i1 = vec_mule(filter, ls);\ -vector signed int i2 = vec_mulo(filter, ls);\ -vector signed int vf1, vf2;\ +i1 = vec_mule(filter, ls);\ +i2 = vec_mulo(filter, ls);\ vf1 = vec_mergeh(i1, i2);\ vf2 = vec_mergel(i1, i2);\ d1 = vec_add(d1, vf1);\ -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 2/2] swscale/ppc: Add av_unused to template vars only used in one includer
Signed-off-by: Lauri Kasanen --- libswscale/ppc/swscale_ppc_template.c | 21 +++-- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c index 3964a7a..aff2dd7 100644 --- a/libswscale/ppc/swscale_ppc_template.c +++ b/libswscale/ppc/swscale_ppc_template.c @@ -44,7 +44,7 @@ static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize, for (j = 0; j < filterSize; j++) { unsigned int joffset=j<<1; unsigned int xoffset=x<<1; -vector unsigned char perm; +vector unsigned char av_unused perm; vector signed short l1,vLumFilter; LOAD_FILTER(vLumFilter,filter); vLumFilter = vec_splat(vLumFilter, 0); @@ -133,8 +133,8 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW, case 8: for (i = 0; i < dstW; i++) { register int srcPos = filterPos[i]; -vector unsigned char src_vF, src_v0, src_v1; -vector unsigned char permS; +vector unsigned char src_vF, av_unused src_v0, av_unused src_v1; +vector unsigned char av_unused permS; vector signed short src_v, filter_v; vector signed int val_v, val_s; FIRST_LOAD(src_v0, srcPos, src, permS); @@ -173,18 +173,19 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW, default: for (i = 0; i < dstW; i++) { -register int j, offset = i * 2 * filterSize; +register int j, av_unused offset = i * 2 * filterSize; register int srcPos = filterPos[i]; vector signed int val_s, val_v = (vector signed int)vzero; -vector signed short filter_v0R; -vector unsigned char permF, src_v0, permS; +vector signed short av_unused filter_v0R; +vector unsigned char av_unused permF, av_unused src_v0, av_unused permS; FIRST_LOAD(filter_v0R, offset, filter, permF); FIRST_LOAD(src_v0, srcPos, src, permS); for (j = 0; j < filterSize - 15; j += 16) { -vector unsigned char src_v1, src_vF; -vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1, src_vA, src_vB; +vector unsigned char av_unused src_v1, src_vF; +vector signed short av_unused 
filter_v1R, av_unused filter_v2R, +filter_v0, filter_v1, src_vA, src_vB; vector signed int val_acc; LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF); src_vA = // vec_unpackh sign-extends... @@ -201,8 +202,8 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, int dstW, if (j < filterSize - 7) { // loading src_v0 is useless, it's already done above -vector unsigned char src_v1, src_vF; -vector signed short src_v, filter_v1R, filter_v; +vector unsigned char av_unused src_v1, src_vF; +vector signed short src_v, av_unused filter_v1R, filter_v; LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF); src_v = // vec_unpackh sign-extends... (vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF)); -- 2.6.2 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel