On Tue, 2016-06-21 at 00:04 -0500, Dan Parrot wrote: > On Tue, 2016-06-21 at 02:22 +0200, Michael Niedermayer wrote: > > On Mon, Jun 20, 2016 at 06:38:18PM -0500, Dan Parrot wrote: > > > On Tue, 2016-06-21 at 01:06 +0200, Michael Niedermayer wrote: > > > > On Mon, Jun 20, 2016 at 05:55:47PM -0500, Dan Parrot wrote: > > > > > On Tue, 2016-06-21 at 00:45 +0200, Michael Niedermayer wrote: > > > > > > On Sun, Jun 19, 2016 at 09:57:42PM +0000, Dan Parrot wrote: > > > > > > > First commit addressing Trac ticket #5570. Functions defined in > > > > > > > libswscale/input.c > > > > > > > have corresponding SIMD definitions in libswscale/ppc/input_vsx.c > > > > > > > --- > > > > > > > libswscale/ppc/Makefile | 1 + > > > > > > > libswscale/ppc/input_vsx.c | 1070 > > > > > > > +++++++++++++++++++++++++++++++++++++++++ > > > > > > > libswscale/swscale.c | 3 + > > > > > > > libswscale/swscale_internal.h | 1 + > > > > > > > 4 files changed, 1075 insertions(+) > > > > > > > create mode 100644 libswscale/ppc/input_vsx.c > > > > > > > > > > > > > > diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile > > > > > > > index d1b596e..2482893 100644 > > > > > > > --- a/libswscale/ppc/Makefile > > > > > > > +++ b/libswscale/ppc/Makefile > > > > > > > @@ -1,3 +1,4 @@ > > > > > > > OBJS += ppc/swscale_altivec.o > > > > > > > \ > > > > > > > + ppc/input_vsx.o > > > > > > > \ > > > > > > > ppc/yuv2rgb_altivec.o > > > > > > > \ > > > > > > > ppc/yuv2yuv_altivec.o > > > > > > > \ > > > > > > > diff --git a/libswscale/ppc/input_vsx.c > > > > > > > b/libswscale/ppc/input_vsx.c > > > > > > > new file mode 100644 > > > > > > > index 0000000..adb0e38 > > > > > > > --- /dev/null > > > > > > > +++ b/libswscale/ppc/input_vsx.c > > > > > > > @@ -0,0 +1,1070 @@ > > > > > > > +/* > > > > > > > + * Copyright (C) 2016 Dan Parrot <dan.par...@mail.com> > > > > > > > + * > > > > > > > + * This file is part of FFmpeg. > > > > > > > + * > > > > > > > + * FFmpeg is free software; you can redistribute it and/or > > > > > > > + * modify it under the terms of the GNU Lesser General Public > > > > > > > + * License as published by the Free Software Foundation; either > > > > > > > + * version 2.1 of the License, or (at your option) any later > > > > > > > version. > > > > > > > + * > > > > > > > + * FFmpeg is distributed in the hope that it will be useful, > > > > > > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > > > > > > GNU > > > > > > > + * Lesser General Public License for more details. > > > > > > > + * > > > > > > > + * You should have received a copy of the GNU Lesser General > > > > > > > Public > > > > > > > + * License along with FFmpeg; if not, write to the Free Software > > > > > > > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > > > > > > > 02110-1301 USA > > > > > > > + */ > > > > > > > + > > > > > > > +#include <math.h> > > > > > > > +#include <stdint.h> > > > > > > > +#include <stdio.h> > > > > > > > +#include <string.h> > > > > > > > + > > > > > > > +#include "libavutil/avutil.h" > > > > > > > +#include "libavutil/bswap.h" > > > > > > > +#include "libavutil/cpu.h" > > > > > > > +#include "libavutil/intreadwrite.h" > > > > > > > +#include "libavutil/mathematics.h" > > > > > > > +#include "libavutil/pixdesc.h" > > > > > > > +#include "libavutil/avassert.h" > > > > > > > +#include "config.h" > > > > > > > +#include "libswscale/rgb2rgb.h" > > > > > > > +#include "libswscale/swscale.h" > > > > > > > +#include "libswscale/swscale_internal.h" > > > > > > > + > > > > > > > +#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : > > > > > > > AV_RL16(pos)) > > > > > > > + > > > > > > > +#define r ((origin == AV_PIX_FMT_BGR48BE || origin == > > > > > > > AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == > > > > > > > AV_PIX_FMT_BGRA64LE) ? b_r : r_b) > > > > > > > +#define b ((origin == AV_PIX_FMT_BGR48BE || origin == > > > > > > > AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == > > > > > > > AV_PIX_FMT_BGRA64LE) ? r_b : b_r) > > > > > > > + > > > > > > > +#if HAVE_VSX > > > > > > > + > > > > > > > +// This is a SIMD version for IBM POWER8 of function > > > > > > > rgb64ToY_c_template > > > > > > > +// in file libswscale/input.c > > > > > > > +static av_always_inline void > > > > > > > +rgb64ToY_c_template_vsx(uint16_t *dst, const uint16_t *src, int > > > > > > > width, > > > > > > > + enum AVPixelFormat origin, int32_t > > > > > > > *rgb2yuv) > > > > > > > +{ > > > > > > > + int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = > > > > > > > rgb2yuv[BY_IDX]; > > > > > > > + int i, j; > > > > > > > + int num_vec, frag; > > > > > > > + > > > > > > > + num_vec = width / 8; > > > > > > > + frag = width % 8; > > > > > > > + > > > > > > > + vector int v_ry = vec_splats((int)ry); > > > > > > > + vector int v_gy = vec_splats((int)gy); > > > > > > > + vector int v_by = vec_splats((int)by); > > > > > > > + > > > > > > > + int s_opr2; > > > > > > > + s_opr2 = (int)(0x2001 << (RGB2YUV_SHIFT-1)); > > > > > > > + > > > > > > > + vector int v_opr1 = vec_splats((int)RGB2YUV_SHIFT); > > > > > > > + vector int v_opr2 = vec_splats((int)s_opr2); > > > > > > > + > > > > > > > + vector int v_r, v_g, v_b, v_tmp; > > > > > > > + vector short v_tmpi, v_dst; > > > > > > > + > > > > > > > + for (i = 0; i < num_vec; i++) { > > > > > > > + for (j = 7; j >= 0 ; j--) { > > > > > > > + int r_b = input_pixel(&src[(i*8+j)*4+0]); > > > > > > > + int g = input_pixel(&src[(i*8+j)*4+1]); > > > > > > > + int b_r = input_pixel(&src[(i*8+j)*4+2]); > > > > > > > + > > > > > > > + v_r[j % 4] = r; > > > > > > > + v_g[j % 4] = g; > > > > > > > + v_b[j % 4] = b; > > > > > > > + > > > > > > > + if (!(j % 4)) { > > > > > > ^ > > > > > > > > > > > > > + v_tmp = v_ry * v_r; > > > > > > > + v_tmp = v_tmp + v_gy * v_g; > > > > > > > + v_tmp = v_tmp + v_by * v_b; > > > > > > > + v_tmp = v_tmp + v_opr2; > > > > > > > + v_tmp = vec_sr(v_tmp, (vector unsigned > > > > > > > int)v_opr1); > > > > > > > + > > > > > > > + v_tmpi = (vector short)v_tmp; > > > > > > > + v_dst[(j / 4) * 4 + 3] = v_tmpi[6]; > > > > > > ^ > > > > > > what is the speed of a division and modulo on PPC compared to a > > > > > > bitwise and ? > > > > > > > > > > > > its also not trivial for the compiler to optimize then out as it > > > > > > has to proof the varables are never negative > > > > > > > > > > > > > > > > > > [...] > > > > > > _______________________________________________ > > > > > > ffmpeg-devel mailing list > > > > > > ffmpeg-devel@ffmpeg.org > > > > > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > > > > > > > I don't know the answer to those questions. But, I see your point. > > > > > Bitwise operations should be faster than multiplications and > > > > > divisions. > > > > > So, I shall change the code to use bitwise ops and compare the > > > > > execution > > > > > time against its present value (well, average value over multiple > > > > > runs) > > > > > > > > it might not make a difference if the compiler does optmize them out > > > > but you should know the approximate speed of the instructions > > > > that your code is likely/potentially going to use. I mean this is > > > > code optimized for PPC. > > > > > > > > [...] > > > > _______________________________________________ > > > > ffmpeg-devel mailing list > > > > ffmpeg-devel@ffmpeg.org > > > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > > > I take exception to the tone in that last sentence and I shall respond > > > in the same spirit. > > > > > > 1. I could spend time obtaining the detailed POWER8 micro architecture > > > description and compare the execution time of each machine instruction. > > > 2. Study gcc source to find out exactly which machine instructions it > > > generates for each C language operator > > > 3. Use 1 and 2 above to determine which C operators to use here. > > > > > > OR > > > > > > I could go ahead and run 2 simulations and compare their average > > > execution times. > > > > > > Seems to me pretty clear which is a better use of time. > > > > Knowing the execution times of instructions is quite usefull > > it certainly takes alot more time to search and read that than to > > benchmark once but once you know the timings approximatly you can > > roughly guess how fast/slow some code is by just looking. > > If knowing that doesnt interrest you then please ignore my comment > > to me these things always feel interresting and it was to me certainly > > quite usefull when optmizing code to know this kind of stuff for x86 > > > > also what are you testing on ? > > > > [...] > > _______________________________________________ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > Changing the operations to use bitwise operators instead of > multiplications, divisions and modulo arithmetic did not appreciably > change execution times. The execution times of the two versions were > within 1.5 seconds of each other in all of worst, best and average times > reported by command "/usr/bin/time -p make -j 4 fate > SAMPLES=fate-suite/" > Worst case real components clustered around 310s. Best case times > clustered about 296s. > > Do you want me to submit a patch using the bitwise operators to replace > the previous patch? > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
ffmpeg compiled with the SIMD patch is faster than that built using the non-SIMD version currently in the repository. Averaging over 10 runs each of the full FATE regression suite shows the SIMD version faster by about 3.5 seconds. The output of "cat /proc/cpuinfo" on the machine gives: ============== processor : 0 cpu : POWER8E (raw), altivec supported clock : 3425.000000MHz revision : 2.1 (pvr 004b 0201) processor : 1 cpu : POWER8E (raw), altivec supported clock : 3425.000000MHz revision : 2.1 (pvr 004b 0201) processor : 2 cpu : POWER8E (raw), altivec supported clock : 3425.000000MHz revision : 2.1 (pvr 004b 0201) processor : 3 cpu : POWER8E (raw), altivec supported clock : 3425.000000MHz revision : 2.1 (pvr 004b 0201) timebase : 512000000 platform : pSeries model : IBM pSeries (emulated by qemu) machine : CHRP IBM pSeries (emulated by qemu) ============== Do you need more information before you can accept and apply the patch? I would like to move on and code up SIMD versions for the rest of the functions in libswscale/input.c. Thanks. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel