Le tiistaina 24. syyskuuta 2024, 16.59.19 EEST J. Dekker a écrit : > From: Niklas Haas <g...@haasn.dev> > > Benched on K230 for VLEN 128, SpaceMIT for VLEN 256. > > C RVV128 C RVV256 > avg_h264_qpel_4_mc00_8 33.9 33.6 (1.01x) 40.1 29.6 (1.35x) > avg_h264_qpel_4_mc01_8 218.8 89.1 (2.46x) 227.6 92.3 (2.47x) > avg_h264_qpel_4_mc02_8 218.8 79.8 (2.74x) 227.8 81.8 (2.78x) > avg_h264_qpel_4_mc03_8 218.8 89.1 (2.46x) 217.3 92.1 (2.36x) > avg_h264_qpel_4_mc10_8 172.3 126.1 (1.37x) 186.1 123.6 (1.51x) > avg_h264_qpel_4_mc11_8 339.1 190.8 (1.78x) 342.3 196.6 (1.74x) > avg_h264_qpel_4_mc12_8 533.6 357.6 (1.49x) 550.6 342.3 (1.61x) > avg_h264_qpel_4_mc13_8 348.4 190.8 (1.83x) 342.3 196.3 (1.74x) > avg_h264_qpel_4_mc20_8 144.8 116.8 (1.24x) 144.3 123.3 (1.17x) > avg_h264_qpel_4_mc21_8 478.1 385.6 (1.24x) 488.1 383.8 (1.27x) > avg_h264_qpel_4_mc22_8 348.4 283.6 (1.23x) 352.6 279.8 (1.26x) > avg_h264_qpel_4_mc23_8 478.1 394.6 (1.21x) 488.1 383.8 (1.27x) > avg_h264_qpel_4_mc30_8 172.6 126.1 (1.37x) 186.1 123.6 (1.51x) > avg_h264_qpel_4_mc31_8 339.4 191.1 (1.78x) 342.3 196.6 (1.74x) > avg_h264_qpel_4_mc32_8 542.9 357.6 (1.52x) 550.6 352.6 (1.56x) > avg_h264_qpel_4_mc33_8 339.4 191.1 (1.78x) 342.3 196.3 (1.74x) > avg_h264_qpel_8_mc00_8 116.8 42.9 (2.72x) 123.6 50.6 (2.44x) > avg_h264_qpel_8_mc01_8 774.4 163.1 (4.75x) 779.8 165.1 (4.72x) > avg_h264_qpel_8_mc02_8 774.4 154.1 (5.03x) 779.8 144.3 (5.40x) > avg_h264_qpel_8_mc03_8 774.4 163.3 (4.74x) 779.8 165.3 (4.72x) > avg_h264_qpel_8_mc10_8 617.1 237.3 (2.60x) 613.1 227.6 (2.69x) > avg_h264_qpel_8_mc11_8 1209.3 376.4 (3.21x) 1206.8 363.1 (3.32x) > avg_h264_qpel_8_mc12_8 1913.3 598.6 (3.20x) 1894.3 561.1 (3.38x) > avg_h264_qpel_8_mc13_8 1218.6 376.4 (3.24x) 1217.1 363.1 (3.35x) > avg_h264_qpel_8_mc20_8 524.4 228.1 (2.30x) 519.3 227.6 (2.28x) > avg_h264_qpel_8_mc21_8 1709.6 681.9 (2.51x) 1707.1 644.3 (2.65x) > avg_h264_qpel_8_mc22_8 1274.3 459.6 (2.77x) 1279.8 436.1 (2.93x) > avg_h264_qpel_8_mc23_8 1700.3 672.6 (2.53x) 1706.8 644.6 
(2.65x) > avg_h264_qpel_8_mc30_8 607.6 246.6 (2.46x) 623.6 238.1 (2.62x) > avg_h264_qpel_8_mc31_8 1209.6 376.4 (3.21x) 1206.8 363.1 (3.32x) > avg_h264_qpel_8_mc32_8 1904.1 607.9 (3.13x) 1894.3 571.3 (3.32x) > avg_h264_qpel_8_mc33_8 1209.6 376.1 (3.22x) 1206.8 363.1 (3.32x) > avg_h264_qpel_16_mc00_8 431.9 89.1 (4.85x) 436.1 71.3 (6.12x) > avg_h264_qpel_16_mc01_8 2894.6 376.1 (7.70x) 2842.3 300.6 (9.46x) > avg_h264_qpel_16_mc02_8 2987.3 348.4 (8.57x) 2967.3 290.1 (10.23x) > avg_h264_qpel_16_mc03_8 2885.3 376.4 (7.67x) 2842.3 300.6 (9.46x) > avg_h264_qpel_16_mc10_8 2404.1 524.4 (4.58x) 2404.8 456.8 (5.26x) > avg_h264_qpel_16_mc11_8 4709.4 811.6 (5.80x) 4675.6 706.8 (6.62x) > avg_h264_qpel_16_mc12_8 7477.9 1274.3 (5.87x) 7436.1 1061.1 (7.01x) > avg_h264_qpel_16_mc13_8 4718.6 820.6 (5.75x) 4655.1 706.8 (6.59x) > avg_h264_qpel_16_mc20_8 2052.1 487.1 (4.21x) 2071.3 446.3 (4.64x) > avg_h264_qpel_16_mc21_8 7440.6 1422.6 (5.23x) 6727.8 1217.3 (5.53x) > avg_h264_qpel_16_mc22_8 5051.9 950.4 (5.32x) 5071.6 790.3 (6.42x) > avg_h264_qpel_16_mc23_8 6764.9 1422.3 (4.76x) 6748.6 1217.3 (5.54x) > avg_h264_qpel_16_mc30_8 2413.1 524.4 (4.60x) 2415.1 467.3 (5.17x) > avg_h264_qpel_16_mc31_8 4681.6 839.1 (5.58x) 4675.6 727.6 (6.43x) > avg_h264_qpel_16_mc32_8 8579.6 1292.8 (6.64x) 7436.3 1071.3 (6.94x) > avg_h264_qpel_16_mc33_8 5375.9 829.9 (6.48x) 4665.3 717.3 (6.50x) > put_h264_qpel_4_mc00_8 24.4 24.4 (1.00x) 29.8 29.6 (1.01x) > put_h264_qpel_4_mc01_8 987.4 79.8 (12.37x) 206.8 92.3 (2.24x) > put_h264_qpel_4_mc02_8 190.8 79.8 (2.39x) 196.6 81.8 (2.40x) > put_h264_qpel_4_mc03_8 209.6 89.1 (2.35x) 206.8 92.3 (2.24x) > put_h264_qpel_4_mc10_8 163.3 117.1 (1.39x) 175.6 123.3 (1.42x) > put_h264_qpel_4_mc11_8 339.4 181.6 (1.87x) 331.8 186.1 (1.78x) > put_h264_qpel_4_mc12_8 533.6 348.4 (1.53x) 529.8 342.3 (1.55x) > put_h264_qpel_4_mc13_8 339.4 190.8 (1.78x) 342.1 186.1 (1.84x) > put_h264_qpel_4_mc20_8 126.3 116.8 (1.08x) 123.6 123.6 (1.00x) > put_h264_qpel_4_mc21_8 468.9 376.1 (1.25x) 477.6 373.6 
(1.28x) > put_h264_qpel_4_mc22_8 330.1 274.4 (1.20x) 331.8 269.6 (1.23x) > put_h264_qpel_4_mc23_8 468.9 376.1 (1.25x) 477.8 383.8 (1.24x) > put_h264_qpel_4_mc30_8 163.3 126.3 (1.29x) 165.1 123.6 (1.34x) > put_h264_qpel_4_mc31_8 339.1 191.1 (1.77x) 342.3 186.1 (1.84x) > put_h264_qpel_4_mc32_8 533.6 348.4 (1.53x) 529.8 352.6 (1.50x) > put_h264_qpel_4_mc33_8 339.4 181.8 (1.87x) 342.1 185.8 (1.84x) > put_h264_qpel_8_mc00_8 98.6 33.6 (2.93x) 92.3 40.1 (2.30x) > put_h264_qpel_8_mc01_8 737.1 153.8 (4.79x) 738.1 144.3 (5.12x) > put_h264_qpel_8_mc02_8 663.1 135.3 (4.90x) 665.1 134.1 (4.96x) > put_h264_qpel_8_mc03_8 737.4 154.1 (4.79x) 1508.8 144.3 (10.46x) > put_h264_qpel_8_mc10_8 598.4 237.1 (2.52x) 592.3 227.6 (2.60x) > put_h264_qpel_8_mc11_8 1172.3 357.9 (3.28x) 1175.6 342.3 (3.43x) > put_h264_qpel_8_mc12_8 1867.1 589.1 (3.17x) 1863.1 561.1 (3.32x) > put_h264_qpel_8_mc13_8 1172.6 366.9 (3.20x) 1175.6 352.8 (3.33x) > put_h264_qpel_8_mc20_8 450.4 218.8 (2.06x) 446.3 206.8 (2.16x) > put_h264_qpel_8_mc21_8 1672.3 663.1 (2.52x) 1675.6 633.8 (2.64x) > put_h264_qpel_8_mc22_8 1144.6 1200.1 (0.95x) 1144.3 425.6 (2.69x) > put_h264_qpel_8_mc23_8 1672.6 672.4 (2.49x) 1665.3 634.1 (2.63x) > put_h264_qpel_8_mc30_8 598.6 237.3 (2.52x) 613.1 227.6 (2.69x) > put_h264_qpel_8_mc31_8 1172.3 376.1 (3.12x) 1175.6 352.6 (3.33x) > put_h264_qpel_8_mc32_8 1857.8 598.6 (3.10x) 1863.1 561.1 (3.32x) > put_h264_qpel_8_mc33_8 1172.3 376.1 (3.12x) 1175.6 352.8 (3.33x) > put_h264_qpel_16_mc00_8 320.6 61.4 (5.22x) 321.3 60.8 (5.28x) > put_h264_qpel_16_mc01_8 2774.3 339.1 (8.18x) 2759.1 279.8 (9.86x) > put_h264_qpel_16_mc02_8 2589.1 320.6 (8.08x) 2571.6 269.3 (9.55x) > put_h264_qpel_16_mc03_8 2774.3 339.4 (8.17x) 2738.1 290.1 (9.44x) > put_h264_qpel_16_mc10_8 2274.3 487.4 (4.67x) 2290.1 436.1 (5.25x) > put_h264_qpel_16_mc11_8 5237.1 792.9 (6.60x) 4529.8 685.8 (6.61x) > put_h264_qpel_16_mc12_8 7357.6 1255.8 (5.86x) 7352.8 1040.1 (7.07x) > put_h264_qpel_16_mc13_8 4579.9 792.9 (5.78x) 4571.6 686.1 (6.66x) > 
put_h264_qpel_16_mc20_8 1802.1 459.6 (3.92x) 1800.6 425.6 (4.23x) > put_h264_qpel_16_mc21_8 6644.6 2246.6 (2.96x) 6644.3 1196.6 (5.55x) > put_h264_qpel_16_mc22_8 4589.1 913.4 (5.02x) 4592.3 769.3 (5.97x) > put_h264_qpel_16_mc23_8 6644.6 1394.6 (4.76x) 6634.1 1196.6 (5.54x) > put_h264_qpel_16_mc30_8 2274.3 496.6 (4.58x) 2290.1 456.8 (5.01x) > put_h264_qpel_16_mc31_8 5255.6 802.1 (6.55x) 4550.8 706.8 (6.44x) > put_h264_qpel_16_mc32_8 7376.1 1265.1 (5.83x) 7352.8 1050.6 (7.00x) > put_h264_qpel_16_mc33_8 4579.9 802.1 (5.71x) 4561.1 696.3 (6.55x)
Did you specialise all functions for 256-bit? It seems that only a very small subset actually exhibits any significant performance gain from doing that. > > Signed-off-by: Niklas Haas <g...@haasn.dev> > Signed-off-by: J. Dekker <j...@itanimul.li> > --- > libavcodec/h264qpel.c | 2 + > libavcodec/h264qpel.h | 1 + > libavcodec/riscv/Makefile | 2 + > libavcodec/riscv/h264qpel_init.c | 113 ++++++++ > libavcodec/riscv/h264qpel_rvv.S | 467 +++++++++++++++++++++++++++++++ > 5 files changed, 585 insertions(+) > create mode 100644 libavcodec/riscv/h264qpel_init.c > create mode 100644 libavcodec/riscv/h264qpel_rvv.S > > diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c > index 65fef03304..faca1e8953 100644 > --- a/libavcodec/h264qpel.c > +++ b/libavcodec/h264qpel.c > @@ -102,6 +102,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int > bit_depth) ff_h264qpel_init_arm(c, bit_depth); > #elif ARCH_PPC > ff_h264qpel_init_ppc(c, bit_depth); > +#elif ARCH_RISCV > + ff_h264qpel_init_riscv(c, bit_depth); > #elif ARCH_X86 > ff_h264qpel_init_x86(c, bit_depth); > #elif ARCH_MIPS > diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h > index 0259e8de23..24baf826f9 100644 > --- a/libavcodec/h264qpel.h > +++ b/libavcodec/h264qpel.h > @@ -34,6 +34,7 @@ void ff_h264qpel_init(H264QpelContext *c, int bit_depth); > void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth); > void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth); > void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth); > +void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth); > void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth); > void ff_h264qpel_init_mips(H264QpelContext *c, int bit_depth); > void ff_h264qpel_init_loongarch(H264QpelContext *c, int bit_depth); > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile > index 27befce929..1f1fa03329 100644 > --- a/libavcodec/riscv/Makefile > +++ b/libavcodec/riscv/Makefile > @@ -33,6 +33,8 @@ 
RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o > OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o > RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ > riscv/h264idct_rvv.o > +OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o > +RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o > OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o > RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o > OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o > diff --git a/libavcodec/riscv/h264qpel_init.c > b/libavcodec/riscv/h264qpel_init.c new file mode 100644 > index 0000000000..69a1345447 > --- /dev/null > +++ b/libavcodec/riscv/h264qpel_init.c > @@ -0,0 +1,113 @@ > +/* > + * RISC-V optimised DSP functions > + * Copyright (c) 2024 Niklas Haas > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA + */ > + > +#include <stdint.h> > + > +#include "config.h" > +#include "libavutil/attributes.h" > +#include "libavutil/riscv/cpu.h" > +#include "libavcodec/h264qpel.h" > + > +#define DECL_QPEL_OPS(OP, SIZE, EXT) > \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc00_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc10_ ## EXT(uint8_t *dst, const > uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc20_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc30_ ## EXT(uint8_t *dst, const > uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc01_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc11_ ## EXT(uint8_t *dst, const > uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc21_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc31_ ## EXT(uint8_t *dst, const > uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc02_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc12_ ## EXT(uint8_t *dst, const > uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc22_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc32_ ## EXT(uint8_t *dst, const > uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc03_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc13_ ## EXT(uint8_t *dst, const > 
uint8_t *src, ptrdiff_t stride); \ +void ff_ ## OP ## _h264_qpel ## SIZE ## > _mc23_ ## EXT(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); \ +void > ff_ ## OP ## _h264_qpel ## SIZE ## _mc33_ ## EXT(uint8_t *dst, const > uint8_t *src, ptrdiff_t stride); + > +DECL_QPEL_OPS(put, 16, rvv256) > +DECL_QPEL_OPS(put, 8, rvv256) > +DECL_QPEL_OPS(put, 4, rvv256) > + > +DECL_QPEL_OPS(avg, 16, rvv256) > +DECL_QPEL_OPS(avg, 8, rvv256) > +DECL_QPEL_OPS(avg, 4, rvv256) > + > +DECL_QPEL_OPS(put, 16, rvv) > +DECL_QPEL_OPS(put, 8, rvv) > +DECL_QPEL_OPS(put, 4, rvv) > + > +DECL_QPEL_OPS(avg, 16, rvv) > +DECL_QPEL_OPS(avg, 8, rvv) > +DECL_QPEL_OPS(avg, 4, rvv) > + > +#define SET_QPEL_FNS(OP, IDX, SIZE, EXT) > \ +do { > \ + c->OP ## > _h264_qpel_pixels_tab[IDX][ 0] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc00_ > ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 1] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc10_ ## EXT; \ + c->OP ## > _h264_qpel_pixels_tab[IDX][ 2] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc20_ > ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 3] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc30_ ## EXT; \ + c->OP ## > _h264_qpel_pixels_tab[IDX][ 4] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc01_ > ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 5] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc11_ ## EXT; \ + c->OP ## > _h264_qpel_pixels_tab[IDX][ 6] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc21_ > ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 7] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc31_ ## EXT; \ + c->OP ## > _h264_qpel_pixels_tab[IDX][ 8] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc02_ > ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][ 9] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc12_ ## EXT; \ + c->OP ## > _h264_qpel_pixels_tab[IDX][10] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc22_ > ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][11] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc32_ ## EXT; \ + c->OP ## > _h264_qpel_pixels_tab[IDX][12] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc03_ > ## 
EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][13] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc13_ ## EXT; \ + c->OP ## > _h264_qpel_pixels_tab[IDX][14] = ff_ ## OP ## _h264_qpel ## SIZE ## _mc23_ > ## EXT; \ + c->OP ## _h264_qpel_pixels_tab[IDX][15] = ff_ ## OP ## > _h264_qpel ## SIZE ## _mc33_ ## EXT; \ +} while (0) > + > +av_cold void ff_h264qpel_init_riscv(H264QpelContext *c, int bit_depth) > +{ > +#if HAVE_RVV > + int flags = av_get_cpu_flags(); > + if (flags & AV_CPU_FLAG_RVV_I32) { > + const int vlen = 8 * ff_get_rv_vlenb(); > + > + switch (bit_depth) { > + case 8: > + if (vlen >= 256) { > + SET_QPEL_FNS(put, 0, 16, rvv256); > + SET_QPEL_FNS(put, 1, 8, rvv256); > + SET_QPEL_FNS(put, 2, 4, rvv256); > + > + SET_QPEL_FNS(avg, 0, 16, rvv256); > + SET_QPEL_FNS(avg, 1, 8, rvv256); > + SET_QPEL_FNS(avg, 2, 4, rvv256); > + } else if (vlen >= 128) { > + SET_QPEL_FNS(put, 0, 16, rvv); > + SET_QPEL_FNS(put, 1, 8, rvv); > + SET_QPEL_FNS(put, 2, 4, rvv); > + > + SET_QPEL_FNS(avg, 0, 16, rvv); > + SET_QPEL_FNS(avg, 1, 8, rvv); > + SET_QPEL_FNS(avg, 2, 4, rvv); > + } > + break; > + } > + } > +#endif > +} > diff --git a/libavcodec/riscv/h264qpel_rvv.S > b/libavcodec/riscv/h264qpel_rvv.S new file mode 100644 > index 0000000000..452796e59f > --- /dev/null > +++ b/libavcodec/riscv/h264qpel_rvv.S > @@ -0,0 +1,467 @@ > +/* > + * SPDX-License-Identifier: BSD-2-Clause > + * > + * Copyright (c) 2024 Niklas Haas > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions are > met: + * > + * 1. Redistributions of source code must retain the above copyright > notice, + * this list of conditions and the following disclaimer. > + * > + * 2. Redistributions in binary form must reproduce the above copyright > notice, + * this list of conditions and the following disclaimer in the > documentation + * and/or other materials provided with the distribution. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS > IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED > TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A > PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY > THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT > (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE > OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. > + */ > + > +#include "libavutil/riscv/asm.S" > + > +.macro lx rd, addr > +#if (__riscv_xlen == 32) > + lw \rd, \addr > +#elif (__riscv_xlen == 64) > + ld \rd, \addr > +#else > + lq \rd, \addr > +#endif > +.endm > + > +.macro sx rd, addr > +#if (__riscv_xlen == 32) > + sw \rd, \addr > +#elif (__riscv_xlen == 64) > + sd \rd, \addr > +#else > + sq \rd, \addr > +#endif > +.endm > + > + /* output is unclipped; clobbers v26-v31 plus t0 and t02 */ > +.macro lowpass_h vdst, src > + addi t4, \src, 3 > + lbu t5, 2(\src) > + vle8.v v31, (t4) > + lbu t4, 1(\src) > + vslide1up.vx v30, v31, t5 > + lbu t5, 0(\src) > + vslide1up.vx v29, v30, t4 > + lbu t4, -1(\src) > + vslide1up.vx v28, v29, t5 > + lbu t5, -2(\src) > + vslide1up.vx v27, v28, t4 > + vslide1up.vx v26, v27, t5 > + vwaddu.vv \vdst, v26, v31 > + vwmaccu.vx \vdst, t6, v28 > + vwmaccu.vx \vdst, t6, v29 > + vwmaccsu.vx \vdst, a7, v27 > + vwmaccsu.vx \vdst, a7, v30 > +.endm > + > + /* output is unclipped */ > +.macro lowpass_v vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, > signed=0 + .if \signed > + vwadd.vv \vdst, \vsrc0, \vsrc5 > + vwmacc.vx \vdst, t6, \vsrc2 > + vwmacc.vx \vdst, t6, \vsrc3 > + vwmacc.vx 
\vdst, a7, \vsrc1 > + vwmacc.vx \vdst, a7, \vsrc4 > + .else > + vwaddu.vv \vdst, \vsrc0, \vsrc5 > + vwmaccu.vx \vdst, t6, \vsrc2 > + vwmaccu.vx \vdst, t6, \vsrc3 > + vwmaccsu.vx \vdst, a7, \vsrc1 > + vwmaccsu.vx \vdst, a7, \vsrc4 > + .endif > +.endm > + > +.macro qpel_mc00 op, dst, src, stride, size > +func ff_\op\()_h264_qpel_pixels, zve32x > +1: add t1, a2, a1 > + add t2, a2, t1 > + add t3, a2, t2 > + vle8.v v0, (a1) > + vle8.v v1, (t1) > + vle8.v v2, (t2) > + vle8.v v3, (t3) > + addi a4, a4, -4 > + add a1, a2, t3 > + add t1, a2, a0 > + add t2, a2, t1 > + add t3, a2, t2 > + .ifc \op, avg > + vle8.v v4, (a0) > + vle8.v v5, (t1) > + vle8.v v6, (t2) > + vle8.v v7, (t3) > + vaaddu.vv v0, v0, v4 > + vaaddu.vv v1, v1, v5 > + vaaddu.vv v2, v2, v6 > + vaaddu.vv v3, v3, v7 > + .endif > + vse8.v v0, (a0) > + vse8.v v1, (t1) > + vse8.v v2, (t2) > + vse8.v v3, (t3) > + add a0, a2, t3 > + bnez a4, 1b > + jr t0 > +endfunc > +.endm > + > + qpel_mc00 put, a0, a1, a2, a4 > + qpel_mc00 avg, a0, a1, a2, a4 > + > +.macro qpel_lowpass op, ext, lmul, lmul2 > +func ff_\op\()_h264_qpel_h_lowpass_\lmul\ext, zve32x > +1: lpad 0 > + add t1, a3, a1 > + add t2, a3, t1 > + add t3, a3, t2 > + lowpass_h v0, a1 > + lowpass_h v2, t1 > + lowpass_h v4, t2 > + lowpass_h v6, t3 > + add a1, a3, t3 > + addi a4, a4, -4 > + vsetvli zero, zero, e16, \lmul2, ta, ma > + vmax.vx v0, v0, zero > + vmax.vx v2, v2, zero > + vmax.vx v4, v4, zero > + vmax.vx v6, v6, zero > + vsetvli zero, zero, e8, \lmul, ta, ma > + vnclipu.wi v0, v0, 5 > + vnclipu.wi v2, v2, 5 > + vnclipu.wi v4, v4, 5 > + vnclipu.wi v6, v6, 5 > + .ifc \ext, _l2 > + add t1, a6, a5 > + add t2, a6, t1 > + add t3, a6, t2 > + vle8.v v8, (a5) > + vle8.v v10, (t1) > + vle8.v v12, (t2) > + vle8.v v14, (t3) > + add a5, a2, t3 > + vaaddu.vv v0, v0, v8 > + vaaddu.vv v2, v2, v10 > + vaaddu.vv v4, v4, v12 > + vaaddu.vv v6, v6, v14 > + .endif > + add t1, a2, a0 > + add t2, a2, t1 > + add t3, a2, t2 > + .ifc \op, avg > + vle8.v v1, (a0) > + vle8.v v3, (t1) > + 
vle8.v v5, (t2) > + vle8.v v7, (t3) > + vaaddu.vv v0, v0, v1 > + vaaddu.vv v2, v2, v3 > + vaaddu.vv v4, v4, v5 > + vaaddu.vv v6, v6, v7 > + .endif > + vse8.v v0, (a0) > + vse8.v v2, (t1) > + vse8.v v4, (t2) > + vse8.v v6, (t3) > + add a0, a2, t3 > + bnez a4, 1b > + jr t0 > +endfunc > + > +func ff_\op\()_h264_qpel_v_lowpass_\lmul\ext, zve32x > + lpad 0 Is this really address-taken? > + sub t1, a1, a3 > + sub t2, t1, a3 > + vle8.v v2, (a1) > + vle8.v v1, (t1) > + vle8.v v0, (t2) > + add t1, a1, a3 > + add t2, t1, a3 > + add a1, t2, a3 > + vle8.v v3, (t1) > + vle8.v v4, (t2) > +1: add t1, a3, a1 > + add t2, a3, t1 > + add t3, a3, t2 > + vle8.v v5, (a1) > + vle8.v v6, (t1) > + vle8.v v7, (t2) > + vle8.v v8, (t3) > + add a1, a3, t3 > + lowpass_v v24, v0, v1, v2, v3, v4, v5 > + lowpass_v v26, v1, v2, v3, v4, v5, v6 > + lowpass_v v28, v2, v3, v4, v5, v6, v7 > + lowpass_v v30, v3, v4, v5, v6, v7, v8 > + addi a4, a4, -4 > + vsetvli zero, zero, e16, \lmul2, ta, ma > + vmax.vx v24, v24, zero > + vmax.vx v26, v26, zero > + vmax.vx v28, v28, zero > + vmax.vx v30, v30, zero > + vsetvli zero, zero, e8, \lmul, ta, ma > + vnclipu.wi v24, v24, 5 > + vnclipu.wi v26, v26, 5 > + vnclipu.wi v28, v28, 5 > + vnclipu.wi v30, v30, 5 > + .ifc \ext, _l2 > + add t1, a6, a5 > + add t2, a6, t1 > + add t3, a6, t2 > + vle8.v v9, (a5) > + vle8.v v10, (t1) > + vle8.v v11, (t2) > + vle8.v v12, (t3) > + add a5, a6, t3 > + vaaddu.vv v24, v24, v9 > + vaaddu.vv v26, v26, v10 > + vaaddu.vv v28, v28, v11 > + vaaddu.vv v30, v30, v12 > + .endif > + add t1, a2, a0 > + add t2, a2, t1 > + add t3, a2, t2 > + .ifc \op, avg > + vle8.v v9, (a0) > + vle8.v v10, (t1) > + vle8.v v11, (t2) > + vle8.v v12, (t3) > + vaaddu.vv v24, v24, v9 > + vaaddu.vv v26, v26, v10 > + vaaddu.vv v28, v28, v11 > + vaaddu.vv v30, v30, v12 > + .endif > + vse8.v v24, (a0) > + vse8.v v26, (t1) > + vse8.v v28, (t2) > + vse8.v v30, (t3) > + add a0, a2, t3 > + vmv.v.v v0, v4 > + vmv.v.v v1, v5 > + vmv.v.v v2, v6 > + vmv.v.v v3, v7 > + vmv.v.v 
v4, v8 > + bnez a4, 1b > + jr t0 > +endfunc > + > +func ff_\op\()_h264_qpel_hv_lowpass_\lmul\ext, zve32x > + lpad 0 Ditto. > + sub t1, a1, a3 > + sub t2, t1, a3 > + lowpass_h v4, a1 > + lowpass_h v2, t1 > + lowpass_h v0, t2 > + add t1, a1, a3 > + add t2, t1, a3 > + add a1, t2, a3 > + lowpass_h v6, t1 > + lowpass_h v8, t2 > +1: add t1, a3, a1 > + add t2, a3, t1 > + add t3, a3, t2 > + lowpass_h v10, a1 > + lowpass_h v12, t1 > + lowpass_h v14, t2 > + lowpass_h v16, t3 > + vsetvli zero, zero, e16, \lmul2, ta, ma > + addi a4, a4, -4 > + lowpass_v v20, v0, v2, v4, v6, v8, v10, signed=1 > + lowpass_v v24, v2, v4, v6, v8, v10, v12, signed=1 > + lowpass_v v28, v4, v6, v8, v10, v12, v14, signed=1 > + vnclip.wi v0, v20, 10 > + lowpass_v v20, v6, v8, v10, v12, v14, v16, signed=1 > + vnclip.wi v2, v24, 10 > + vnclip.wi v4, v28, 10 > + vnclip.wi v6, v20, 10 > + vmax.vx v18, v0, zero > + vmax.vx v20, v2, zero > + vmax.vx v22, v4, zero > + vmax.vx v24, v6, zero > + vmv.v.v v0, v8 > + vmv.v.v v2, v10 > + vmv.v.v v4, v12 > + vmv.v.v v6, v14 > + vmv.v.v v8, v16 > + add a1, a3, t3 > + vsetvli zero, zero, e8, \lmul, ta, ma > + vnclipu.wi v18, v18, 0 > + vnclipu.wi v20, v20, 0 > + vnclipu.wi v22, v22, 0 > + vnclipu.wi v24, v24, 0 > + .ifc \ext, _l2 > + add t1, a6, a5 > + add t2, a6, t1 > + add t3, a6, t2 > + vle8.v v26, (a5) > + vle8.v v27, (t1) > + vle8.v v28, (t2) > + vle8.v v29, (t3) > + add a5, a6, t3 > + vaaddu.vv v18, v18, v26 > + vaaddu.vv v20, v20, v27 > + vaaddu.vv v22, v22, v28 > + vaaddu.vv v24, v24, v29 > + .endif > + add t1, a2, a0 > + add t2, a2, t1 > + add t3, a2, t2 > + .ifc \op, avg > + vle8.v v26, (a0) > + vle8.v v27, (t1) > + vle8.v v28, (t2) > + vle8.v v29, (t3) > + vaaddu.vv v18, v18, v26 > + vaaddu.vv v20, v20, v27 > + vaaddu.vv v22, v22, v28 > + vaaddu.vv v24, v24, v29 > + .endif > + vse8.v v18, (a0) > + vse8.v v20, (t1) > + vse8.v v22, (t2) > + vse8.v v24, (t3) > + add a0, a2, t3 > + bnez a4, 1b > + jr t0 > +endfunc > +.endm > + > +/* Note: We could possibly 
specialize for the width 8 / width 4 cases by > + loading 32 bit integers, but this makes the convolutions more > complicated + to implement, so it's not necessarily any faster. */ > + > +.macro h264_qpel lmul, lmul2 > + qpel_lowpass put, , \lmul, \lmul2 > + qpel_lowpass put, _l2, \lmul, \lmul2 > + qpel_lowpass avg, , \lmul, \lmul2 > + qpel_lowpass avg, _l2, \lmul, \lmul2 > +.endm > + > + h264_qpel m1, m2 > + h264_qpel mf2, m1 > + h264_qpel mf4, mf2 > + h264_qpel mf8, mf4 > + > +.macro h264_qpel_1pass op, case, lmul, size, ext=rvv, dir, offset > +func ff_\op\()_h264_qpel\size\()_\case\()_\ext, zve32x > + lpad 0 > + vsetivli zero, \size, e8, \lmul, ta, ma > + csrwi vxrm, 0 > + li a4, \size > + li t6, 20 > + li a7, -5 > + mv a3, a2 > + mv t0, ra > +.ifnb \offset > + .ifc \dir, v > + add a5, a1, \offset > + .else > + addi a5, a1, \offset > + .endif > + mv a6, a3 > + j ff_\op\()_h264_qpel_\dir\()_lowpass_\lmul\()_l2 > +.else > + j ff_\op\()_h264_qpel_\dir\()_lowpass_\lmul\() > +.endif > +endfunc > +.endm > + > +.macro h264_qpel_2pass op, case, lmul, size, ext=rvv, dir1, dir2, off1=0, > off2 +func ff_\op\()_h264_qpel\size\()_\case\()_\ext, zve32x > + lpad 0 > + vsetivli zero, \size, e8, \lmul, ta, ma > + csrwi vxrm, 0 > + addi sp, sp, -16 > + li a4, \size > + li t6, 20 > + li a7, -5 > + sx a0, 0(sp) > + sx a1, 8(sp) This won't work if XLEN = 128: sx then expands to sq, which stores 16 bytes, so the slots at 0(sp) and 8(sp) overlap and the 16-byte frame is too small. 
> + .ifc \off1, a2 > + add a1, a1, \off1 > + .elseif \off1 > + addi a1, a1, \off1 > + .endif > + mv a3, a2 > + .ifc \op, avg > + // Use temporary array on stack for the first pass > + addi a0, sp, -(\size * \size) > + li a2, \size > + .endif > + jal t0, ff_put_h264_qpel_\dir1\()_lowpass_\lmul > + lx a0, 0(sp) > + lx a1, 8(sp) > + .ifc \op, put > + // Directly reuse the first pass output buffer > + mv a5, a0 > + mv a6, a2 > + .else > + addi a5, sp, -(\size * \size) > + li a6, \size > + mv a2, a3 > + .endif > + .ifnb \off2 > + addi a1, a1, \off2 > + .endif > + li a4, \size > + mv t0, ra > + addi sp, sp, 16 > + j ff_\op\()_h264_qpel_\dir2\()_lowpass_\lmul\()_l2 > +endfunc > +.endm > + > +.macro ff_h264_qpel_fns op, lmul, size, ext=rvv > +func ff_\op\()_h264_qpel\size\()_mc00_\ext, zve32x > + lpad 0 > + vsetivli zero, \size, e8, \lmul, ta, ma > + csrwi vxrm, 0 > + li a4, \size > + mv t0, ra > + j ff_\op\()_h264_qpel_pixels > +endfunc > + > + h264_qpel_1pass \op, mc20, \lmul, \size, \ext, h > + h264_qpel_1pass \op, mc02, \lmul, \size, \ext, v > + h264_qpel_1pass \op, mc10, \lmul, \size, \ext, h, 0 > + h264_qpel_1pass \op, mc30, \lmul, \size, \ext, h, 1 > + h264_qpel_1pass \op, mc01, \lmul, \size, \ext, v, zero > + h264_qpel_1pass \op, mc03, \lmul, \size, \ext, v, a2 > + h264_qpel_1pass \op, mc22, \lmul, \size, \ext, hv > + > + h264_qpel_2pass \op, mc11, \lmul, \size, \ext, h, v > + h264_qpel_2pass \op, mc21, \lmul, \size, \ext, h, hv > + h264_qpel_2pass \op, mc12, \lmul, \size, \ext, v, hv > + h264_qpel_2pass \op, mc31, \lmul, \size, \ext, h, v, off2=1 > + h264_qpel_2pass \op, mc13, \lmul, \size, \ext, h, v, a2 > + h264_qpel_2pass \op, mc33, \lmul, \size, \ext, h, v, a2, 1 > + h264_qpel_2pass \op, mc23, \lmul, \size, \ext, h, hv, a2 > + h264_qpel_2pass \op, mc32, \lmul, \size, \ext, v, hv, 1 > +.endm > + > + ff_h264_qpel_fns put, mf2, 16, rvv256 > + ff_h264_qpel_fns put, mf4, 8, rvv256 > + ff_h264_qpel_fns put, mf8, 4, rvv256 > + > + ff_h264_qpel_fns avg, mf2, 16, 
rvv256 > + ff_h264_qpel_fns avg, mf4, 8, rvv256 > + ff_h264_qpel_fns avg, mf8, 4, rvv256 > + > + ff_h264_qpel_fns put, m1, 16, rvv > + ff_h264_qpel_fns put, mf2, 8, rvv > + ff_h264_qpel_fns put, mf4, 4, rvv > + > + ff_h264_qpel_fns avg, m1, 16, rvv > + ff_h264_qpel_fns avg, mf2, 8, rvv > + ff_h264_qpel_fns avg, mf4, 4, rvv -- レミ・デニ-クールモン http://www.remlab.net/ _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".