On Thu, Jan 11, 2018 at 9:45 PM, Martin Vignali <martin.vign...@gmail.com> wrote:

> +    if (check_func(c.sub_left_predict, "sub_left_predict")) {
> +        call_ref(dst0, src0, stride, width, height);
> +        call_new(dst1, src0, stride, width, height);
> +        if (memcmp(dst0, dst1, width))
> +            fail();
> +        bench_new(dst1, src0, stride, width, height);
> +    }
You're only verifying the results of the first row here. Changing it to
test all rows results in test failures.

> +    int width = av_clip(rnd(), 16, 128);
> +    int height = av_clip(rnd(), 16, 128);

This kind of clipping will result in the values being 128 almost every
run. You should also use constant sizes instead of random ones, because
random sizes will make benchmarking inconsistent: you'll measure
different things for the C and asm versions.

You could do something along the lines of

static const struct { uint8_t w, h, s; } planes[] = {
    {  16,  16,  16 },
    {  21,  23,  25 },
    {  32,  17,  48 },
    {  15, 128,  16 },
    { 128, 127, 128 },
};

and just test all of those every run (a rough sketch of what that could
look like is at the end of this mail).

> +%if ARCH_X86_64
> +INIT_XMM ssse3
> +cglobal sub_left_predict, 4,5,5, dst, src, stride, width, height, x
> +    mova      m0, [pb_15] ; shuffle for last byte
> +    mova      m1, [pb_80] ; prev initial
> +.nextrow:
> +    xor       xq, xq
> +
> +    .loop:
> +        movu      m2, [srcq + xq]
> +        psubb     m1, m2 ; - prev
> +        pslldq    m3, m1, 1
> +        psubb     m3, m1
> +        movu      [dstq+xq], m3
> +        pshufb    m1, m2, m0
> +        add       xq, mmsize
> +        cmp       xd, widthd
> +        jl        .loop
> +
> +    add       srcq, strideq
> +    add       dstq, widthq
> +    sub       heightq, 1
> +    jg        .nextrow
> +    REP_RET
> +%endif

There's no need to restrict this to x86-64 only. The register
specification is wrong and will fail on Windows (and 32-bit): cglobal
is told there are 4 arguments and 5 GPRs, but the code names and uses 6
registers, so heightq is never loaded and xq is never allocated. Using
a constant 15 for pshufb will be wrong for the first byte of every row
except the first with non-mod16 widths.

Try something like this:

INIT_XMM avx
cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
    movsxdifnidn widthq, widthd ; Change width from int to ptrdiff_t to get rid of this
    mova      m1, [pb_80]       ; prev
    add       dstq, widthq      ; point to the end of the first row and
    add       srcq, widthq      ; walk it with a negative offset
    lea       xd, [widthq-1]
    neg       widthq
    and       xd, 15
    pinsrb    m4, m1, xd, 15    ; mask: last row byte -> byte 15, rest zeroed
    mov       xq, widthq        ; xq = -width
.loop:
    movu      m0, [srcq+widthq]
    palignr   m2, m0, m1, 15    ; src bytes shifted by one, i.e. the left pixels
    movu      m1, [srcq+widthq+16]
    palignr   m3, m1, m0, 15
    psubb     m2, m0, m2        ; src[x] - src[x-1]
    psubb     m3, m1, m3
    movu      [dstq+widthq], m2
    movu      [dstq+widthq+16], m3
    add       widthq, 2*16
    jl        .loop
    add       srcq, strideq
    sub       dstq, xq          ; advance dst by width (xq is negative)
    test      xd, 16            ; did the row's last byte end up in m0 or m1?
    jz        .mod32
    mova      m1, m0
.mod32:
    pshufb    m1, m4            ; keep only the row's last byte as prev
    mov       widthq, xq
    dec       heightd
    jg        .loop
    RET
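For reference, here's a rough, untested sketch of how the test side
could look with the fixed sizes above and with every row verified. The
LLVidEncDSPContext / ff_llvidencdsp_init names, the entry point, the
exact prototype (width as ptrdiff_t, per the comment above) and the
buffer sizes are my assumptions from the patch context, not necessarily
the final API:

#include <string.h>

#include "checkasm.h"
#include "libavutil/internal.h"
#include "libavcodec/lossless_videoencdsp.h"

/* Assumed size: covers the largest plane in the table, with slack for
 * the asm reading/writing a few bytes past the end of the last row. */
#define BUF_SIZE (128 * 128)

static const struct { uint8_t w, h, s; } planes[] = {
    {  16,  16,  16 },
    {  21,  23,  25 },
    {  32,  17,  48 },
    {  15, 128,  16 },
    { 128, 127, 128 },
};

/* hypothetical entry point; would need registering in checkasm.c */
void checkasm_check_sub_left_predict(void)
{
    LLVidEncDSPContext c;
    LOCAL_ALIGNED_32(uint8_t, src,  [BUF_SIZE]);
    LOCAL_ALIGNED_32(uint8_t, dst0, [BUF_SIZE]);
    LOCAL_ALIGNED_32(uint8_t, dst1, [BUF_SIZE]);
    int i, j;

    declare_func(void, uint8_t *dst, const uint8_t *src,
                 ptrdiff_t stride, ptrdiff_t width, int height);

    ff_llvidencdsp_init(&c);

    if (check_func(c.sub_left_predict, "sub_left_predict")) {
        for (i = 0; i < FF_ARRAY_ELEMS(planes); i++) {
            int w = planes[i].w, h = planes[i].h, s = planes[i].s;

            for (j = 0; j < BUF_SIZE; j++)
                src[j] = rnd();
            memset(dst0, 0, BUF_SIZE);
            memset(dst1, 0, BUF_SIZE);

            call_ref(dst0, src, s, w, h);
            call_new(dst1, src, s, w, h);
            /* dst is packed with w bytes per row, so this checks all rows */
            if (memcmp(dst0, dst1, w * h))
                fail();
        }
        /* bench a single fixed size only, so numbers stay comparable */
        bench_new(dst1, src, 128, 128, 127);
    }
}

Randomizing src and clearing both dst buffers on every iteration keeps
a stale result from a previous size from hiding a failure, and
benchmarking one fixed size avoids mixing measurements from different
dimensions.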