> 2022年8月29日 20:30,Andreas Rheinhardt <andreas.rheinha...@outlook.com> 写道: > > Hao Chen: >> ffmpeg -i ~/media/1_h264_1080p_30fps_3Mbps.mp4 -f rawvideo -s 640x480 >> -pix_fmt >> rgb24 -y /dev/null -an >> before: 150fps >> after: 183fps >> >> Signed-off-by: Hao Chen <chen...@loongson.cn> >> --- >> libswscale/loongarch/Makefile | 3 +- >> libswscale/loongarch/output_lasx.c | 1982 +++++++++++++++++ >> libswscale/loongarch/swscale_init_loongarch.c | 3 + >> libswscale/loongarch/swscale_loongarch.h | 6 + >> 4 files changed, 1993 insertions(+), 1 deletion(-) >> create mode 100644 libswscale/loongarch/output_lasx.c >> >> diff --git a/libswscale/loongarch/Makefile b/libswscale/loongarch/Makefile >> index 4345971514..54d48b3de0 100644 >> --- a/libswscale/loongarch/Makefile >> +++ b/libswscale/loongarch/Makefile >> @@ -2,4 +2,5 @@ OBJS-$(CONFIG_SWSCALE) += >> loongarch/swscale_init_loongarch.o >> LASX-OBJS-$(CONFIG_SWSCALE) += loongarch/swscale_lasx.o \ >> loongarch/input_lasx.o \ >> loongarch/yuv2rgb_lasx.o \ >> - loongarch/rgb2rgb_lasx.o >> + loongarch/rgb2rgb_lasx.o \ >> + >> loongarch/output_lasx.o >> diff --git a/libswscale/loongarch/output_lasx.c >> b/libswscale/loongarch/output_lasx.c >> new file mode 100644 >> index 0000000000..19f82692ff >> --- /dev/null >> +++ b/libswscale/loongarch/output_lasx.c >> @@ -0,0 +1,1982 @@ >> +/* >> + * Copyright (C) 2022 Loongson Technology Corporation Limited >> + * Contributed by Hao Chen(chen...@loongson.cn) >> + * >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. >> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU >> + * Lesser General Public License for more details. >> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >> USA >> + */ >> + >> +#include "swscale_loongarch.h" >> +#include "libavutil/loongarch/loongson_intrinsics.h" >> + >> +void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize, >> + const int16_t **src, uint8_t *dest, int dstW, >> + const uint8_t *dither, int offset) >> +{ >> + int i; >> + int len = dstW - 15; >> + __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400, >> + 0x1C0C180814041000, 0x1C1814100C080400}; >> + __m256i val1, val2, val3; >> + uint8_t dither0 = dither[offset & 7]; >> + uint8_t dither1 = dither[(offset + 1) & 7]; >> + uint8_t dither2 = dither[(offset + 2) & 7]; >> + uint8_t dither3 = dither[(offset + 3) & 7]; >> + uint8_t dither4 = dither[(offset + 4) & 7]; >> + uint8_t dither5 = dither[(offset + 5) & 7]; >> + uint8_t dither6 = dither[(offset + 6) & 7]; >> + uint8_t dither7 = dither[(offset + 7) & 7]; >> + int val_1[8] = {dither0, dither2, dither4, dither6, >> + dither0, dither2, dither4, dither6}; >> + int val_2[8] = {dither1, dither3, dither5, dither7, >> + dither1, dither3, dither5, dither7}; >> + int val_3[8] = {dither0, dither1, dither2, dither3, >> + dither4, dither5, dither6, dither7}; >> + >> + DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2); >> + val3 = __lasx_xvld(val_3, 0); >> + >> + for (i = 0; i < len; i += 16) { >> + int j; >> + __m256i src0, filter0, val; >> + __m256i val_ev, val_od; >> + >> + val_ev = __lasx_xvslli_w(val1, 12); >> + val_od = __lasx_xvslli_w(val2, 12); >> + >> + for (j = 0; j < filterSize; j++) { >> + src0 = __lasx_xvld(src[j]+ i, 0); >> + filter0 = __lasx_xvldrepl_h((filter + j), 0); >> + val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0); >> + val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0); >> + 
} >> + val_ev = __lasx_xvsrai_w(val_ev, 19); >> + val_od = __lasx_xvsrai_w(val_od, 19); >> + val_ev = __lasx_xvclip255_w(val_ev); >> + val_od = __lasx_xvclip255_w(val_od); >> + val = __lasx_xvshuf_b(val_od, val_ev, mask); >> + __lasx_xvstelm_d(val, (dest + i), 0, 0); >> + __lasx_xvstelm_d(val, (dest + i), 8, 2); >> + } >> + if (dstW - i >= 8){ >> + int j; >> + __m256i src0, filter0, val_h; >> + __m256i val_l; >> + >> + val_l = __lasx_xvslli_w(val3, 12); >> + >> + for (j = 0; j < filterSize; j++) { >> + src0 = __lasx_xvld(src[j] + i, 0); >> + src0 = __lasx_vext2xv_w_h(src0); >> + filter0 = __lasx_xvldrepl_h((filter + j), 0); >> + filter0 = __lasx_vext2xv_w_h(filter0); >> + val_l = __lasx_xvmadd_w(val_l, src0, filter0); >> + } >> + val_l = __lasx_xvsrai_w(val_l, 19); >> + val_l = __lasx_xvclip255_w(val_l); >> + val_h = __lasx_xvpermi_d(val_l, 0x4E); >> + val_l = __lasx_xvshuf_b(val_h, val_l, mask); >> + __lasx_xvstelm_d(val_l, (dest + i), 0, 1); >> + i += 8; >> + } >> + for (; i < dstW; i++) { >> + int val = dither[(i + offset) & 7] << 12; >> + int j; >> + for (j = 0; j< filterSize; j++) >> + val += src[j][i] * filter[j]; >> + >> + dest[i] = av_clip_uint8(val >> 19); >> + } >> +} >> + >> +/*Copy from libswscale/output.c*/ >> +static av_always_inline void >> +yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2, >> + unsigned A1, unsigned A2, >> + const void *_r, const void *_g, const void *_b, int y, >> + enum AVPixelFormat target, int hasAlpha) >> +{ >> + if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA || >> + target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) { >> + uint32_t *dest = (uint32_t *) _dest; >> + const uint32_t *r = (const uint32_t *) _r; >> + const uint32_t *g = (const uint32_t *) _g; >> + const uint32_t *b = (const uint32_t *) _b; >> + >> +#if CONFIG_SMALL >> + dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1]; >> + dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2]; >> +#else >> +#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1 >> + int sh = (target == 
AV_PIX_FMT_RGB32_1 || >> + target == AV_PIX_FMT_BGR32_1) ? 0 : 24; >> + av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF); >> +#endif >> + dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1]; >> + dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2]; >> +#endif >> + } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) { >> + uint8_t *dest = (uint8_t *) _dest; >> + const uint8_t *r = (const uint8_t *) _r; >> + const uint8_t *g = (const uint8_t *) _g; >> + const uint8_t *b = (const uint8_t *) _b; >> + >> +#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b) >> +#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r) >> + >> + dest[i * 6 + 0] = r_b[Y1]; >> + dest[i * 6 + 1] = g[Y1]; >> + dest[i * 6 + 2] = b_r[Y1]; >> + dest[i * 6 + 3] = r_b[Y2]; >> + dest[i * 6 + 4] = g[Y2]; >> + dest[i * 6 + 5] = b_r[Y2]; >> +#undef r_b >> +#undef b_r >> + } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 || >> + target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 || >> + target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) { >> + uint16_t *dest = (uint16_t *) _dest; >> + const uint16_t *r = (const uint16_t *) _r; >> + const uint16_t *g = (const uint16_t *) _g; >> + const uint16_t *b = (const uint16_t *) _b; >> + int dr1, dg1, db1, dr2, dg2, db2; >> + >> + if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) { >> + dr1 = ff_dither_2x2_8[ y & 1 ][0]; >> + dg1 = ff_dither_2x2_4[ y & 1 ][0]; >> + db1 = ff_dither_2x2_8[(y & 1) ^ 1][0]; >> + dr2 = ff_dither_2x2_8[ y & 1 ][1]; >> + dg2 = ff_dither_2x2_4[ y & 1 ][1]; >> + db2 = ff_dither_2x2_8[(y & 1) ^ 1][1]; >> + } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) { >> + dr1 = ff_dither_2x2_8[ y & 1 ][0]; >> + dg1 = ff_dither_2x2_8[ y & 1 ][1]; >> + db1 = ff_dither_2x2_8[(y & 1) ^ 1][0]; >> + dr2 = ff_dither_2x2_8[ y & 1 ][1]; >> + dg2 = ff_dither_2x2_8[ y & 1 ][0]; >> + db2 = ff_dither_2x2_8[(y & 1) ^ 1][1]; >> + } else { >> + dr1 = ff_dither_4x4_16[ y & 3 ][0]; >> + dg1 = 
ff_dither_4x4_16[ y & 3 ][1]; >> + db1 = ff_dither_4x4_16[(y & 3) ^ 3][0]; >> + dr2 = ff_dither_4x4_16[ y & 3 ][1]; >> + dg2 = ff_dither_4x4_16[ y & 3 ][0]; >> + db2 = ff_dither_4x4_16[(y & 3) ^ 3][1]; >> + } >> + >> + dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1]; >> + dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]; >> + } else /* 8/4 bits */ { >> + uint8_t *dest = (uint8_t *) _dest; >> + const uint8_t *r = (const uint8_t *) _r; >> + const uint8_t *g = (const uint8_t *) _g; >> + const uint8_t *b = (const uint8_t *) _b; >> + int dr1, dg1, db1, dr2, dg2, db2; >> + >> + if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) { >> + const uint8_t * const d64 = ff_dither_8x8_73[y & 7]; >> + const uint8_t * const d32 = ff_dither_8x8_32[y & 7]; >> + dr1 = dg1 = d32[(i * 2 + 0) & 7]; >> + db1 = d64[(i * 2 + 0) & 7]; >> + dr2 = dg2 = d32[(i * 2 + 1) & 7]; >> + db2 = d64[(i * 2 + 1) & 7]; >> + } else { >> + const uint8_t * const d64 = ff_dither_8x8_73 [y & 7]; >> + const uint8_t * const d128 = ff_dither_8x8_220[y & 7]; >> + dr1 = db1 = d128[(i * 2 + 0) & 7]; >> + dg1 = d64[(i * 2 + 0) & 7]; >> + dr2 = db2 = d128[(i * 2 + 1) & 7]; >> + dg2 = d64[(i * 2 + 1) & 7]; >> + } >> + >> + if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) { >> + dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] + >> + ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4); >> + } else { >> + dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1]; >> + dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]; >> + } >> + } >> +} >> + >> +#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \ >> +{ \ >> + Y1 = __lasx_xvpickve2gr_w(vec_y1, t1); \ >> + Y2 = __lasx_xvpickve2gr_w(vec_y2, t2); \ >> + U = __lasx_xvpickve2gr_w(vec_u, t3); \ >> + V = __lasx_xvpickve2gr_w(vec_v, t4); \ >> + r = c->table_rV[V]; \ >> + g = (c->table_gU[U] + c->table_gV[V]); \ >> + b = c->table_bU[U]; \ >> + yuv2rgb_write(dest, count, Y1, Y2, 0, 0, \ >> + r, g, b, y, target, 0); \ 
>> + count++; \ >> +} >> + >> +static void >> +yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter, >> + const int16_t **lumSrc, int lumFilterSize, >> + const int16_t *chrFilter, const int16_t **chrUSrc, >> + const int16_t **chrVSrc, int chrFilterSize, >> + const int16_t **alpSrc, uint8_t *dest, int dstW, >> + int y, enum AVPixelFormat target, int hasAlpha) >> +{ >> + int i, j; >> + int count = 0; >> + int t = 1 << 18; >> + int len = dstW >> 6; >> + int res = dstW & 63; >> + int len_count = (dstW + 1) >> 1; >> + const void *r, *g, *b; >> + int head = YUVRGB_TABLE_HEADROOM; >> + __m256i headroom = __lasx_xvreplgr2vr_w(head); >> + >> + for (i = 0; i < len; i++) { >> + int Y1, Y2, U, V, count_lum = count << 1; >> + __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, >> v_src2; >> + __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, >> yh2_od; >> + __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, >> temp; >> + >> + yl1_ev = __lasx_xvldrepl_w(&t, 0); >> + yl1_od = yl1_ev; >> + yh1_ev = yl1_ev; >> + yh1_od = yl1_ev; >> + u1_ev = yl1_ev; >> + v1_ev = yl1_ev; >> + u1_od = yl1_ev; >> + v1_od = yl1_ev; >> + yl2_ev = yl1_ev; >> + yl2_od = yl1_ev; >> + yh2_ev = yl1_ev; >> + yh2_od = yl1_ev; >> + u2_ev = yl1_ev; >> + v2_ev = yl1_ev; >> + u2_od = yl1_ev; >> + v2_od = yl1_ev; >> + for (j = 0; j < lumFilterSize; j++) { >> + int16_t *src_lum = lumSrc[j] + count_lum; >> + temp = __lasx_xvldrepl_h((lumFilter + j), 0); >> + DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64, >> + src_lum, 96, l_src1, l_src2, l_src3, l_src4); >> + >> + yl1_ev = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1); >> + yl1_od = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1); >> + yh1_ev = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2); >> + yh1_od = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2); >> + yl2_ev = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3); >> + yl2_od = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3); >> + yh2_ev = __lasx_xvmaddwev_w_h(yh2_ev, temp, 
l_src4); >> + yh2_od = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4); >> + } >> + for (j = 0; j < chrFilterSize; j++) { >> + DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + >> count, 32, >> + u_src1, u_src2); >> + DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + >> count, 32, >> + v_src1, v_src2); >> + temp = __lasx_xvldrepl_h((chrFilter + j), 0); >> + u1_ev = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1); >> + u1_od = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1); >> + v1_ev = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1); >> + v1_od = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1); >> + u2_ev = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2); >> + u2_od = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2); >> + v2_ev = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2); >> + v2_od = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2); >> + } >> + yl1_ev = __lasx_xvsrai_w(yl1_ev, 19); >> + yh1_ev = __lasx_xvsrai_w(yh1_ev, 19); >> + yl1_od = __lasx_xvsrai_w(yl1_od, 19); >> + yh1_od = __lasx_xvsrai_w(yh1_od, 19); >> + u1_ev = __lasx_xvsrai_w(u1_ev, 19); >> + v1_ev = __lasx_xvsrai_w(v1_ev, 19); >> + u1_od = __lasx_xvsrai_w(u1_od, 19); >> + v1_od = __lasx_xvsrai_w(v1_od, 19); >> + yl2_ev = __lasx_xvsrai_w(yl2_ev, 19); >> + yh2_ev = __lasx_xvsrai_w(yh2_ev, 19); >> + yl2_od = __lasx_xvsrai_w(yl2_od, 19); >> + yh2_od = __lasx_xvsrai_w(yh2_od, 19); >> + u2_ev = __lasx_xvsrai_w(u2_ev, 19); >> + v2_ev = __lasx_xvsrai_w(v2_ev, 19); >> + u2_od = __lasx_xvsrai_w(u2_od, 19); >> + v2_od = __lasx_xvsrai_w(v2_od, 19); >> + u1_ev = __lasx_xvadd_w(u1_ev, headroom); >> + v1_ev = __lasx_xvadd_w(v1_ev, headroom); >> + u1_od = __lasx_xvadd_w(u1_od, headroom); >> + v1_od = __lasx_xvadd_w(v1_od, headroom); >> + u2_ev = __lasx_xvadd_w(u2_ev, headroom); >> + v2_ev = __lasx_xvadd_w(v2_ev, headroom); >> + u2_od = __lasx_xvadd_w(u2_od, headroom); >> + v2_od = __lasx_xvadd_w(v2_od, headroom); >> + WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0); >> + WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0); >> + 
WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1); >> + WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1); >> + WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2); >> + WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2); >> + WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3); >> + WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7); >> + WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3); >> + WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7); >> + WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7); >> + } >> + if (res >= 32) { >> + int Y1, Y2, U, V, count_lum = count << 1; >> + __m256i l_src1, l_src2, u_src, v_src; >> + __m256i yl_ev, 
yl_od, yh_ev, yh_od; >> + __m256i u_ev, u_od, v_ev, v_od, temp; >> + >> + yl_ev = __lasx_xvldrepl_w(&t, 0); >> + yl_od = yl_ev; >> + yh_ev = yl_ev; >> + yh_od = yl_ev; >> + u_ev = yl_ev; >> + v_ev = yl_ev; >> + u_od = yl_ev; >> + v_od = yl_ev; >> + for (j = 0; j < lumFilterSize; j++) { >> + temp = __lasx_xvldrepl_h((lumFilter + j), 0); >> + DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + >> count_lum, >> + 32, l_src1, l_src2); >> + yl_ev = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1); >> + yl_od = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1); >> + yh_ev = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2); >> + yh_od = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2); >> + } >> + for (j = 0; j < chrFilterSize; j++) { >> + DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + >> count, 0, >> + u_src, v_src); >> + temp = __lasx_xvldrepl_h((chrFilter + j), 0); >> + u_ev = __lasx_xvmaddwev_w_h(u_ev, temp, u_src); >> + u_od = __lasx_xvmaddwod_w_h(u_od, temp, u_src); >> + v_ev = __lasx_xvmaddwev_w_h(v_ev, temp, v_src); >> + v_od = __lasx_xvmaddwod_w_h(v_od, temp, v_src); >> + } >> + yl_ev = __lasx_xvsrai_w(yl_ev, 19); >> + yh_ev = __lasx_xvsrai_w(yh_ev, 19); >> + yl_od = __lasx_xvsrai_w(yl_od, 19); >> + yh_od = __lasx_xvsrai_w(yh_od, 19); >> + u_ev = __lasx_xvsrai_w(u_ev, 19); >> + v_ev = __lasx_xvsrai_w(v_ev, 19); >> + u_od = __lasx_xvsrai_w(u_od, 19); >> + v_od = __lasx_xvsrai_w(v_od, 19); >> + u_ev = __lasx_xvadd_w(u_ev, headroom); >> + v_ev = __lasx_xvadd_w(v_ev, headroom); >> + u_od = __lasx_xvadd_w(u_od, headroom); >> + v_od = __lasx_xvadd_w(v_od, headroom); >> + WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0); >> + WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0); >> + WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1); >> + WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1); >> + WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2); >> + WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2); >> + WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3); >> + 
WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7); >> + WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7); >> + res -= 32; >> + } >> + if (res >= 16) { >> + int Y1, Y2, U, V; >> + int count_lum = count << 1; >> + __m256i l_src, u_src, v_src; >> + __m256i y_ev, y_od, u, v, temp; >> + >> + y_ev = __lasx_xvldrepl_w(&t, 0); >> + y_od = y_ev; >> + u = y_ev; >> + v = y_ev; >> + for (j = 0; j < lumFilterSize; j++) { >> + temp = __lasx_xvldrepl_h((lumFilter + j), 0); >> + l_src = __lasx_xvld(lumSrc[j] + count_lum, 0); >> + y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src); >> + y_od = __lasx_xvmaddwod_w_h(y_od, temp, l_src); >> + } >> + for (j = 0; j < chrFilterSize; j++) { >> + DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + >> count, >> + 0, u_src, v_src); >> + temp = __lasx_xvldrepl_h((chrFilter + j), 0); >> + u_src = __lasx_vext2xv_w_h(u_src); >> + v_src = __lasx_vext2xv_w_h(v_src); >> + u = __lasx_xvmaddwev_w_h(u, temp, u_src); >> + v = __lasx_xvmaddwev_w_h(v, temp, v_src); >> + } >> + y_ev = __lasx_xvsrai_w(y_ev, 19); >> + y_od = __lasx_xvsrai_w(y_od, 19); >> + u = __lasx_xvsrai_w(u, 19); >> + v = __lasx_xvsrai_w(v, 19); >> + u = __lasx_xvadd_w(u, headroom); >> + v = __lasx_xvadd_w(v, headroom); >> + WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0); >> + WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1); >> + WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2); >> + WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3); >> + WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4); >> + WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5); >> + WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6); >> + 
WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7); >> + res -= 16; >> + } >> + if (res >= 8) { >> + int Y1, Y2, U, V; >> + int count_lum = count << 1; >> + __m256i l_src, u_src, v_src; >> + __m256i y_ev, uv, temp; >> + >> + y_ev = __lasx_xvldrepl_w(&t, 0); >> + uv = y_ev; >> + for (j = 0; j < lumFilterSize; j++) { >> + temp = __lasx_xvldrepl_h((lumFilter + j), 0); >> + l_src = __lasx_xvld(lumSrc[j] + count_lum, 0); >> + l_src = __lasx_vext2xv_w_h(l_src); >> + y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src); >> + } >> + for (j = 0; j < chrFilterSize; j++) { >> + u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0); >> + v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0); >> + temp = __lasx_xvldrepl_h((chrFilter + j), 0); >> + u_src = __lasx_xvilvl_d(v_src, u_src); >> + u_src = __lasx_vext2xv_w_h(u_src); >> + uv = __lasx_xvmaddwev_w_h(uv, temp, u_src); >> + } >> + y_ev = __lasx_xvsrai_w(y_ev, 19); >> + uv = __lasx_xvsrai_w(uv, 19); >> + uv = __lasx_xvadd_w(uv, headroom); >> + WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4); >> + WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5); >> + WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6); >> + WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7); >> + } >> + for (; count < len_count; count++) { >> + int Y1 = 1 << 18; >> + int Y2 = Y1; >> + int U = Y1; >> + int V = Y1; >> + >> + for (j = 0; j < lumFilterSize; j++) { >> + Y1 += lumSrc[j][count * 2] * lumFilter[j]; >> + Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j]; >> + } >> + for (j = 0; j < chrFilterSize; j++) { >> + U += chrUSrc[j][count] * chrFilter[j]; >> + V += chrVSrc[j][count] * chrFilter[j]; >> + } >> + Y1 >>= 19; >> + Y2 >>= 19; >> + U >>= 19; >> + V >>= 19; >> + r = c->table_rV[V + YUVRGB_TABLE_HEADROOM]; >> + g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + >> + c->table_gV[V + YUVRGB_TABLE_HEADROOM]); >> + b = c->table_bU[U + YUVRGB_TABLE_HEADROOM]; >> + >> + yuv2rgb_write(dest, count, Y1, Y2, 0, 0, >> + r, g, b, y, target, 0); >> + } >> +} >> + >> +static void >> 
+yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2], >> + const int16_t *ubuf[2], const int16_t *vbuf[2], >> + const int16_t *abuf[2], uint8_t *dest, int dstW, >> + int yalpha, int uvalpha, int y, >> + enum AVPixelFormat target, int hasAlpha) >> +{ >> + const int16_t *buf0 = buf[0], *buf1 = buf[1], >> + *ubuf0 = ubuf[0], *ubuf1 = ubuf[1], >> + *vbuf0 = vbuf[0], *vbuf1 = vbuf[1]; >> + int yalpha1 = 4096 - yalpha; >> + int uvalpha1 = 4096 - uvalpha; >> + int i, count = 0; >> + int len = dstW - 15; >> + int len_count = (dstW + 1) >> 1; >> + const void *r, *g, *b; >> + int head = YUVRGB_TABLE_HEADROOM; >> + __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1); >> + __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1); >> + __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha); >> + __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha); >> + __m256i headroom = __lasx_xvreplgr2vr_w(head); >> + >> + for (i = 0; i < len; i += 16) { >> + int Y1, Y2, U, V; >> + int i_dex = i << 1; >> + int c_dex = count << 1; >> + __m256i y0_h, y0_l, y0, u0, v0; >> + __m256i y1_h, y1_l, y1, u1, v1; >> + __m256i y_l, y_h, u, v; >> + >> + DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex, >> + buf1, i_dex, y0, u0, v0, y1); >> + DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1); >> + DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l); >> + DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h); >> + DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1); >> + y0_l = __lasx_xvmul_w(y0_l, v_yalpha1); >> + y0_h = __lasx_xvmul_w(y0_h, v_yalpha1); >> + u0 = __lasx_xvmul_w(u0, v_uvalpha1); >> + v0 = __lasx_xvmul_w(v0, v_uvalpha1); >> + y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l); >> + y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h); >> + u = __lasx_xvmadd_w(u0, v_uvalpha, u1); >> + v = __lasx_xvmadd_w(v0, v_uvalpha, v1); >> + y_l = __lasx_xvsrai_w(y_l, 19); >> + y_h = __lasx_xvsrai_w(y_h, 19); >> + u = __lasx_xvsrai_w(u, 19); >> + v = __lasx_xvsrai_w(v, 19); >> + u = 
__lasx_xvadd_w(u, headroom); >> + v = __lasx_xvadd_w(v, headroom); >> + WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0); >> + WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1); >> + WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2); >> + WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3); >> + WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4); >> + WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5); >> + WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6); >> + WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7); >> + } >> + if (dstW - i >= 8) { >> + int Y1, Y2, U, V; >> + int i_dex = i << 1; >> + __m256i y0_l, y0, u0, v0; >> + __m256i y1_l, y1, u1, v1; >> + __m256i y_l, u, v; >> + >> + y0 = __lasx_xvldx(buf0, i_dex); > > 1. Not long ago, I tried to constify the src pointer of several asm > functions and noticed that they produced new warnings for loongarch > (according to patchwork: > https://patchwork.ffmpeg.org/project/ffmpeg/patch/db6pr0101mb2214178d3e6b8dca5b86f8198f...@db6pr0101mb2214.eurprd01.prod.exchangelabs.com/), > even though I was sure that the code is const-correct. After finding > (via https://github.com/opencv/opencv/pull/21833) a toolchain > (https://gitee.com/wenux/cross-compiler-la-on-x86) that can build the > lasx and lsx code (upstream GCC seems to be lacking lsx and lasx support > at the moment; at least, my self-compiled loongarch-GCC did not support > lsx and lasx) the issue was clear: lsxintrin.h and lasxintrin.h do not > use const at all, even for functions that only read data (I presume the > vl in __lsx_vldx stands for "vector load"?). > So I sent another iteration > https://ffmpeg.org/pipermail/ffmpeg-devel/2022-August/299562.html of > that patchset that now added wrappers for __lsx_vldx() and > __lasx_xvldx() and cc'ed you and some other developers from loongson to > alert you of the issue in the hope that you fix the headers, so that my > wrappers wouldn't need to be applied. That didn't work, as my mails > could not be delivered to you. So I applied the patchset. > 2. 
You use __lasx_xvldx() to read through a pointer to const int16_t. This will give > new warnings unless the above issue has been fixed. Has it? > 3. I don't know whether it has, as patchwork's fate tests haven't been working for a few days now. Given that the mails I receive from patchwork when > it doesn't like a commit message arrive from "Patchwork > <yinshiyou...@loongson.cn>" I presume that Loongson is now somehow > running patchwork, so you should be able to inform the right people to > fix it. > 4. If you fixed the const-issue, can you please make an updated > toolchain with lsx and lasx support enabled available to us? > > - Andreas >
Hi Andreas, Sorry for the late reply. This issue will be fixed by using const for v1 of __lsx_vldx, and I will update the toolchain of the LoongArch patchwork runner ASAP. Thank you very much for analyzing this problem and giving suggestions. Thanks, Shiyou _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".