On Tuesday, 13 August 2024 at 17.03.36 EEST, J. Dekker wrote:
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vnclipsu.wi shifti, lmul, lmul2, vregs:vararg
> +        vsetvli         zero, zero, e16, \lmul2, ta, ma
We don't typically do that, for a very good reason: the vsetvli is most
often redundant, and non-obviously so. (Also, in some cases there are
better ways to do a signed-to-unsigned clip.)

> +        .irp x, \vregs
> +        vmax.vx         \x, \x, zero
> +        .endr
> +        vsetvli         zero, zero, e8, \lmul, ta, ma
> +        .irp x, \vregs
> +        vnclipu.wi      \x, \x, \shifti
> +        .endr
> +.endm
> +
> +.macro lowpass_init lmul, sizei, size, w0, w1, backup

This is needlessly convoluted. In fact, backup is not even used, which
rather highlights the point.

> +        vsetivli        zero, \sizei, e8, \lmul, ta, ma
> +        csrwi           vxrm, 0
> +        li              \size, \sizei
> +        .ifnb \w0
> +        li              \w0, 20
> +        li              \w1, -5
> +        .endif
> +.endm
> +
> +        /* output is unclipped; clobbers v26-v31 plus \tmp and \tmp2 */
> +.macro lowpass_h vdst, src, w0, w1, tmp=t3, tmp2=t4
> +        addi            \tmp, \src, 3
> +        lbu             \tmp2, 2(\src)
> +        vle8.v          v31, (\tmp)
> +        lbu             \tmp, 1(\src)
> +        vslide1up.vx    v30, v31, \tmp2
> +        lbu             \tmp2, 0(\src)
> +        vslide1up.vx    v29, v30, \tmp
> +        lbu             \tmp, -1(\src)
> +        vslide1up.vx    v28, v29, \tmp2
> +        lbu             \tmp2, -2(\src)
> +        vslide1up.vx    v27, v28, \tmp
> +        vslide1up.vx    v26, v27, \tmp2

That's a lot of sequentially dependent vector instructions just to save
zero-extending v31 before the MACs. Are you sure it's faster that way?
(One dependency-free alternative is sketched below, after the qpel_mc00
hunk.)

> +        vwaddu.vv       \vdst, v26, v31
> +        vwmaccu.vx      \vdst, \w0, v28
> +        vwmaccu.vx      \vdst, \w0, v29
> +        vwmaccsu.vx     \vdst, \w1, v27
> +        vwmaccsu.vx     \vdst, \w1, v30
> +.endm
> +
> +        /* output is unclipped */
> +.macro lowpass_v w0, w1, vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, signed=0
> +        .if \signed
> +        vwadd.vv        \vdst, \vsrc0, \vsrc5
> +        vwmacc.vx       \vdst, \w0, \vsrc2
> +        vwmacc.vx       \vdst, \w0, \vsrc3
> +        vwmacc.vx       \vdst, \w1, \vsrc1
> +        vwmacc.vx       \vdst, \w1, \vsrc4
> +        .else
> +        vwaddu.vv       \vdst, \vsrc0, \vsrc5
> +        vwmaccu.vx      \vdst, \w0, \vsrc2
> +        vwmaccu.vx      \vdst, \w0, \vsrc3
> +        vwmaccsu.vx     \vdst, \w1, \vsrc1
> +        vwmaccsu.vx     \vdst, \w1, \vsrc4
> +        .endif
> +.endm
> +
> +.macro qpel_mc00 op, dst, src, stride, size
> +func ff_\op\()_h264_qpel_pixels, zve32x
> +1:
> +        add             t0, \stride, \src
> +        add             t1, \stride, t0
> +        add             t2, \stride, t1
> +        vle8.v          v0, (\src)
> +        vle8.v          v1, (t0)
> +        vle8.v          v2, (t1)
> +        vle8.v          v3, (t2)
> +        addi            \size, \size, -4
> +        add             \src, \stride, t2
> +        add             t0, \stride, \dst
> +        add             t1, \stride, t0
> +        add             t2, \stride, t1
> +        .ifc \op, avg
> +        vle8.v          v4, (\dst)
> +        vle8.v          v5, (t0)
> +        vle8.v          v6, (t1)
> +        vle8.v          v7, (t2)
> +        vaaddu.vv       v0, v0, v4
> +        vaaddu.vv       v1, v1, v5
> +        vaaddu.vv       v2, v2, v6
> +        vaaddu.vv       v3, v3, v7
> +        .endif
> +        vse8.v          v0, (\dst)
> +        vse8.v          v1, (t0)
> +        vse8.v          v2, (t1)
> +        vse8.v          v3, (t2)
> +        add             \dst, \stride, t2
> +        bnez            \size, 1b
> +        ret
> +endfunc
> +.endm
> +
> +        qpel_mc00       put, a0, a1, a2, a4
> +        qpel_mc00       avg, a0, a1, a2, a4

Please don't add constant macro parameters.
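Coming back to lowpass_h: for comparison, here is one way to produce the
same six shifted rows with no serial dependency at all, using six
overlapping unit-stride loads. This is only an untested sketch (same
v26-v31 usage, same bytes read, i.e. src-2 through src+3+vl-1); it trades
the five slides and five scalar loads for five extra vector loads, so
whether it is actually faster would need checkasm numbers on real
hardware:

        addi            \tmp, \src, -2
        addi            \tmp2, \src, -1
        vle8.v          v26, (\tmp)
        vle8.v          v27, (\tmp2)
        vle8.v          v28, (\src)
        addi            \tmp, \src, 1
        addi            \tmp2, \src, 2
        vle8.v          v29, (\tmp)
        vle8.v          v30, (\tmp2)
        addi            \tmp, \src, 3
        vle8.v          v31, (\tmp)
        /* the widening MACs stay exactly as in the patch */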
> +
> +.macro qpel_lowpass op, ext, lmul, lmul2, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride
> +func ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x
> +1:
> +        add             t0, \src_stride, \src
> +        add             t1, \src_stride, t0
> +        add             t2, \src_stride, t1
> +        lowpass_h       v0, \src, \w0, \w1
> +        lowpass_h       v2, t0, \w0, \w1
> +        lowpass_h       v4, t1, \w0, \w1
> +        lowpass_h       v6, t2, \w0, \w1
> +        add             \src, \src_stride, t2
> +        addi            \size, \size, -4
> +        vnclipsu.wi     5, \lmul, \lmul2, v0, v2, v4, v6
> +        .ifnb \src2
> +        add             t0, \src2_stride, \src2
> +        add             t1, \src2_stride, t0
> +        add             t2, \src2_stride, t1
> +        vle8.v          v8, (\src2)
> +        vle8.v          v10, (t0)
> +        vle8.v          v12, (t1)
> +        vle8.v          v14, (t2)
> +        add             \src2, \dst_stride, t2
> +        vaaddu.vv       v0, v0, v8
> +        vaaddu.vv       v2, v2, v10
> +        vaaddu.vv       v4, v4, v12
> +        vaaddu.vv       v6, v6, v14
> +        .endif
> +        add             t0, \dst_stride, \dst
> +        add             t1, \dst_stride, t0
> +        add             t2, \dst_stride, t1
> +        .ifc \op, avg
> +        vle8.v          v1, (\dst)
> +        vle8.v          v3, (t0)
> +        vle8.v          v5, (t1)
> +        vle8.v          v7, (t2)
> +        vaaddu.vv       v0, v0, v1
> +        vaaddu.vv       v2, v2, v3
> +        vaaddu.vv       v4, v4, v5
> +        vaaddu.vv       v6, v6, v7
> +        .endif
> +        vse8.v          v0, (\dst)
> +        vse8.v          v2, (t0)
> +        vse8.v          v4, (t1)
> +        vse8.v          v6, (t2)
> +        add             \dst, \dst_stride, t2
> +        bnez            \size, 1b
> +        ret
> +endfunc
> +
> +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x
> +        sub             t0, \src, \src_stride
> +        sub             t1, t0, \src_stride
> +        vle8.v          v2, (\src)
> +        vle8.v          v1, (t0)
> +        vle8.v          v0, (t1)
> +        add             t0, \src, \src_stride
> +        add             t1, t0, \src_stride
> +        add             \src, t1, \src_stride
> +        vle8.v          v3, (t0)
> +        vle8.v          v4, (t1)
> +1:
> +        add             t0, \src_stride, \src
> +        add             t1, \src_stride, t0
> +        add             t2, \src_stride, t1
> +        vle8.v          v5, (\src)
> +        vle8.v          v6, (t0)
> +        vle8.v          v7, (t1)
> +        vle8.v          v8, (t2)
> +        add             \src, \src_stride, t2
> +        lowpass_v       \w0, \w1, v24, v0, v1, v2, v3, v4, v5
> +        lowpass_v       \w0, \w1, v26, v1, v2, v3, v4, v5, v6
> +        lowpass_v       \w0, \w1, v28, v2, v3, v4, v5, v6, v7
> +        lowpass_v       \w0, \w1, v30, v3, v4, v5, v6, v7, v8
> +        addi            \size, \size, -4
> +        vnclipsu.wi     5, \lmul, \lmul2, v24, v26, v28, v30
> +        .ifnb \src2
> +        add             t0, \src2_stride, \src2
> +        add             t1, \src2_stride, t0
> +        add             t2, \src2_stride, t1
> +        vle8.v          v9, (\src2)
> +        vle8.v          v10, (t0)
> +        vle8.v          v11, (t1)
> +        vle8.v          v12, (t2)
> +        add             \src2, \src2_stride, t2
> +        vaaddu.vv       v24, v24, v9
> +        vaaddu.vv       v26, v26, v10
> +        vaaddu.vv       v28, v28, v11
> +        vaaddu.vv       v30, v30, v12
> +        .endif
> +        add             t0, \dst_stride, \dst
> +        add             t1, \dst_stride, t0
> +        add             t2, \dst_stride, t1
> +        .ifc \op, avg
> +        vle8.v          v9, (\dst)
> +        vle8.v          v10, (t0)
> +        vle8.v          v11, (t1)
> +        vle8.v          v12, (t2)
> +        vaaddu.vv       v24, v24, v9
> +        vaaddu.vv       v26, v26, v10
> +        vaaddu.vv       v28, v28, v11
> +        vaaddu.vv       v30, v30, v12
> +        .endif
> +        vse8.v          v24, (\dst)
> +        vse8.v          v26, (t0)
> +        vse8.v          v28, (t1)
> +        vse8.v          v30, (t2)
> +        add             \dst, \dst_stride, t2
> +        vmv.v.v         v0, v4
> +        vmv.v.v         v1, v5
> +        vmv.v.v         v2, v6
> +        vmv.v.v         v3, v7
> +        vmv.v.v         v4, v8

At this point, any vector move without rationale is an automatic -1 from
me.
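To illustrate what a move-free structure could look like (a sketch only,
untested, and I am not claiming this is the right trade-off): drop the
carried registers and load all nine source rows inside the loop, with
\src pointing at the first output row of each pass. The pre-loop history
loads and the initial \src advance then disappear, at the cost of
re-reading five rows per pass:

1:
        sub             t0, \src, \src_stride
        sub             t1, t0, \src_stride
        vle8.v          v0, (t1)
        vle8.v          v1, (t0)
        vle8.v          v2, (\src)
        add             t0, \src, \src_stride
        add             t1, t0, \src_stride
        add             t2, t1, \src_stride
        vle8.v          v3, (t0)
        vle8.v          v4, (t1)
        vle8.v          v5, (t2)
        add             t0, t2, \src_stride
        add             t1, t0, \src_stride
        add             t2, t1, \src_stride
        vle8.v          v6, (t0)
        vle8.v          v7, (t1)
        vle8.v          v8, (t2)
        sub             \src, t1, \src_stride  /* first output row of the next pass */
        /* rest of the loop body exactly as in the patch,
           minus the vmv.v.v block */

Whether five extra unit-stride loads of hot data beat five vector moves
is micro-architecture dependent, so this too would need benchmarking; the
point is merely that the moves are not forced by the algorithm.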
> +        bnez            \size, 1b
> +        ret
> +endfunc
> +
> +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x
> +        sub             t0, \src, \src_stride
> +        sub             t1, t0, \src_stride
> +        lowpass_h       v4, \src, \w0, \w1
> +        lowpass_h       v2, t0, \w0, \w1
> +        lowpass_h       v0, t1, \w0, \w1
> +        add             t0, \src, \src_stride
> +        add             t1, t0, \src_stride
> +        add             \src, t1, \src_stride
> +        lowpass_h       v6, t0, \w0, \w1
> +        lowpass_h       v8, t1, \w0, \w1
> +1:
> +        add             t0, \src_stride, \src
> +        add             t1, \src_stride, t0
> +        add             t2, \src_stride, t1
> +        lowpass_h       v10, \src, \w0, \w1
> +        lowpass_h       v12, t0, \w0, \w1
> +        lowpass_h       v14, t1, \w0, \w1
> +        lowpass_h       v16, t2, \w0, \w1
> +        vsetvli         zero, zero, e16, \lmul2, ta, ma
> +        addi            \size, \size, -4
> +        lowpass_v       \w0, \w1, v20, v0, v2, v4, v6, v8, v10, signed=1
> +        lowpass_v       \w0, \w1, v24, v2, v4, v6, v8, v10, v12, signed=1
> +        lowpass_v       \w0, \w1, v28, v4, v6, v8, v10, v12, v14, signed=1
> +        vnclip.wi       v0, v20, 10
> +        lowpass_v       \w0, \w1, v20, v6, v8, v10, v12, v14, v16, signed=1
> +        vnclip.wi       v2, v24, 10
> +        vnclip.wi       v4, v28, 10
> +        vnclip.wi       v6, v20, 10
> +        vmax.vx         v18, v0, zero
> +        vmax.vx         v20, v2, zero
> +        vmax.vx         v22, v4, zero
> +        vmax.vx         v24, v6, zero
> +        vmv.v.v         v0, v8
> +        vmv.v.v         v2, v10
> +        vmv.v.v         v4, v12
> +        vmv.v.v         v6, v14
> +        vmv.v.v         v8, v16
> +        add             \src, \src_stride, t2
> +        vsetvli         zero, zero, e8, \lmul, ta, ma
> +        vnclipu.wi      v18, v18, 0
> +        vnclipu.wi      v20, v20, 0
> +        vnclipu.wi      v22, v22, 0
> +        vnclipu.wi      v24, v24, 0
> +        .ifnb \src2
> +        add             t0, \src2_stride, \src2
> +        add             t1, \src2_stride, t0
> +        add             t2, \src2_stride, t1
> +        vle8.v          v26, (\src2)
> +        vle8.v          v27, (t0)
> +        vle8.v          v28, (t1)
> +        vle8.v          v29, (t2)
> +        add             \src2, \src2_stride, t2
> +        vaaddu.vv       v18, v18, v26
> +        vaaddu.vv       v20, v20, v27
> +        vaaddu.vv       v22, v22, v28
> +        vaaddu.vv       v24, v24, v29
> +        .endif
> +        add             t0, \dst_stride, \dst
> +        add             t1, \dst_stride, t0
> +        add             t2, \dst_stride, t1
> +        .ifc \op, avg
> +        vle8.v          v26, (\dst)
> +        vle8.v          v27, (t0)
> +        vle8.v          v28, (t1)
> +        vle8.v          v29, (t2)
> +        vaaddu.vv       v18, v18, v26
> +        vaaddu.vv       v20, v20, v27
> +        vaaddu.vv       v22, v22, v28
> +        vaaddu.vv       v24, v24, v29
> +        .endif
> +        vse8.v          v18, (\dst)
> +        vse8.v          v20, (t0)
> +        vse8.v          v22, (t1)
> +        vse8.v          v24, (t2)
> +        add             \dst, \dst_stride, t2
> +        bnez            \size, 1b
> +        ret
> +endfunc
> +.endm
> +
> +/* Note: We could possibly specialize for the width 8 / width 4 cases by
> +   loading 32 bit integers, but this makes the convolutions more complicated
> +   to implement, so it's not necessarily any faster. */
> +
> +.macro h264_qpel lmul, lmul2
> +        qpel_lowpass    put,     , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6
> +        qpel_lowpass    put, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6
> +        qpel_lowpass    avg,     , \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6
> +        qpel_lowpass    avg, _l2, \lmul, \lmul2, a0, a1, a2, a3, a4, t5, t6, a5, a6
> +.endm
> +
> +        h264_qpel m1, m2
> +        h264_qpel mf2, m1
> +        h264_qpel mf4, mf2
> +        h264_qpel mf8, mf4
> +
> +.macro ff_h264_qpel_fns op, lmul, sizei, ext=rvv, dst, src, dst_stride, src_stride, size, w0, w1, src2, src2_stride, tmp
> +func ff_\op\()_h264_qpel\sizei\()_mc00_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size,
> +        j               ff_\op\()_h264_qpel_pixels
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc10_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        mv              \src_stride, \dst_stride
> +        mv              \src2, \src
> +        mv              \src2_stride, \src_stride
> +        j               ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc20_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        mv              \src_stride, \dst_stride
> +        j               ff_\op\()_h264_qpel_h_lowpass_\lmul\()
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc30_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        mv              \src_stride, \dst_stride
> +        addi            \src2, \src, 1
> +        mv              \src2_stride, \src_stride
> +        j               ff_\op\()_h264_qpel_h_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc01_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        mv              \src_stride, \dst_stride
> +        mv              \src2, \src
> +        mv              \src2_stride, \src_stride
> +        j               ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc02_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        mv              \src_stride, \dst_stride
> +        j               ff_\op\()_h264_qpel_v_lowpass_\lmul
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc03_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        mv              \src_stride, \dst_stride
> +        add             \src2, \src, \src_stride
> +        mv              \src2_stride, \src_stride
> +        j               ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc11_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src

It's all but impossible to tell if spilling is actually necessary when
you alias registers like this.

> +        mv              \tmp, ra

Use t0 for the subprocedure return. See the specs.
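To spell that out (a sketch only, not a drop-in for this patch): the
psABI reserves x5/t0 as the alternate link register, so an internal
helper can be linked and returned through t0, leaving ra alone and making
the mv \tmp, ra / mv ra, \tmp pair unnecessary. And since the helper is
in the same file, a plain jal is enough; no call pseudo-op needed:

        jal     t0, ff_put_h264_qpel_h_lowpass_\lmul   // link via t0

with the helper, when entered this way, returning with:

        jr      t0                                     // return via t0

(The existing entry points that reach the helper with a tail j would then
need a mv t0, ra first, or a second entry label that keeps using ret.)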
> +        mv              \src_stride, \dst_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_h_lowpass_\lmul

You can use jal here.

> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc31_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src
> +        mv              \tmp, ra
> +        mv              \src_stride, \dst_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_h_lowpass_\lmul
> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        addi            \src, \src, 1
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc13_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src
> +        mv              \tmp, ra
> +        mv              \src_stride, \dst_stride
> +        add             \src, \src, \src_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_h_lowpass_\lmul
> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc33_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src
> +        mv              \tmp, ra
> +        mv              \src_stride, \dst_stride
> +        add             \src, \src, \src_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_h_lowpass_\lmul
> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        addi            \src, \src, 1
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_v_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc22_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        mv              \src_stride, \dst_stride
> +        j               ff_\op\()_h264_qpel_hv_lowpass_\lmul
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc21_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src
> +        mv              \tmp, ra
> +        mv              \src_stride, \dst_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_h_lowpass_\lmul
> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc23_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src
> +        mv              \tmp, ra
> +        mv              \src_stride, \dst_stride
> +        add             \src, \src, \src_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_h_lowpass_\lmul
> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc12_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src
> +        mv              \tmp, ra
> +        mv              \src_stride, \dst_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_v_lowpass_\lmul
> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
> +endfunc
> +
> +func ff_\op\()_h264_qpel\sizei\()_mc32_\ext, zve32x
> +        lowpass_init    \lmul, \sizei, \size, \w0, \w1
> +        push            \dst, \src
> +        mv              \tmp, ra
> +        addi            \src, \src, 1
> +        mv              \src_stride, \dst_stride
> +        addi            \dst, sp, -(\sizei * \sizei)
> +        li              \dst_stride, \sizei
> +        call            ff_put_h264_qpel_v_lowpass_\lmul
> +        addi            \src2, sp, -(\sizei * \sizei)
> +        mv              \src2_stride, \dst_stride
> +        pop             \dst, \src
> +        mv              \dst_stride, \src_stride
> +        li              \size, \sizei
> +        mv              ra, \tmp
> +        j               ff_\op\()_h264_qpel_hv_lowpass_\lmul\()_l2
> +endfunc
> +.endm
> +
> +        ff_h264_qpel_fns put, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns put, mf4,  8, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns put, mf8,  4, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +
> +        ff_h264_qpel_fns avg, mf2, 16, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns avg, mf4,  8, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns avg, mf8,  4, rvv256, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +
> +        ff_h264_qpel_fns put, m1,  16, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns put, mf2,  8, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns put, mf4,  4, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +
> +        ff_h264_qpel_fns avg, m1,  16, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns avg, mf2,  8, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7
> +        ff_h264_qpel_fns avg, mf4,  4, rvv, a0, a1, a2, a3, a4, t5, t6, a5, a6, a7

-- 
レミ・デニ-クールモン
http://www.remlab.net/