> 2021年12月24日 下午5:49,Hao Chen <chen...@loongson.cn> 写道: > > From: Shiyou Yin <yinshiyou...@loongson.cn> > > ./ffmpeg -i 8_mpeg4_1080p_24fps_12Mbps.avi -f rawvideo -y /dev/null -an > before:376fps > after :433fps > > Change-Id: Ic8018562093154887323b508b81d0f489c0d265d > Signed-off-by: Hao Chen <chen...@loongson.cn> > --- > libavcodec/hpeldsp.c | 2 + > libavcodec/hpeldsp.h | 1 + > libavcodec/loongarch/Makefile | 2 + > libavcodec/loongarch/hpeldsp_init_loongarch.c | 50 + > libavcodec/loongarch/hpeldsp_lasx.c | 1289 +++++++++++++++++ > libavcodec/loongarch/hpeldsp_lasx.h | 58 + > 6 files changed, 1402 insertions(+) > create mode 100644 libavcodec/loongarch/hpeldsp_init_loongarch.c > create mode 100644 libavcodec/loongarch/hpeldsp_lasx.c > create mode 100644 libavcodec/loongarch/hpeldsp_lasx.h > > > diff --git a/libavcodec/loongarch/hpeldsp_lasx.c > b/libavcodec/loongarch/hpeldsp_lasx.c > new file mode 100644 > index 0000000000..07f0c4b517 > --- /dev/null > +++ b/libavcodec/loongarch/hpeldsp_lasx.c > @@ -0,0 +1,1289 @@ >
> +void ff_put_no_rnd_pixels16_y2_8_lasx(uint8_t *block, const uint8_t *pixels, > + ptrdiff_t line_size, int h) > +{ > + if (h == 16) { > + common_vt_bil_no_rnd_16x16_lasx(pixels, line_size, block, line_size); > + } else if (h == 8) { > + common_vt_bil_no_rnd_8x16_lasx(pixels, line_size, block, line_size); > + } > +} > + > +static void common_hv_bil_no_rnd_16x16_lasx(const uint8_t *src, > + int32_t src_stride, > + uint8_t *dst, int32_t dst_stride) > +{ > + __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; > + __m256i src10, src11, src12, src13, src14, src15, src16, src17; > + __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; > + int32_t src_stride_2x = src_stride << 1; > + int32_t src_stride_4x = src_stride << 2; > + int32_t src_stride_3x = src_stride_2x + src_stride; > + uint8_t* _src = (uint8_t*)src; > + > + src0 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, > src2); > + src3 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src4 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, > src6); > + src7 = __lasx_xvldx(_src, src_stride_3x); > + _src += (1 - src_stride_4x); > + src9 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src10, src11); > + src12 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src13 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src14, src15); > + src16 = __lasx_xvldx(_src, src_stride_3x); > + _src += (src_stride_4x - 1); > + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); > + > + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, > + src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3); > + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, > + src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7); > + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 
0x02, src13, src17, 0x02, > + src8, src9); > + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, > src3, > + sum0, sum2, sum4, sum6); > + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, > src3, > + sum1, sum3, sum5, sum7); > + src8 = __lasx_xvilvl_h(src9, src4); > + src9 = __lasx_xvilvh_h(src9, src4); > + > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, > + sum3, sum3, src0, src1, src2, src3); > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, > + sum7, sum7, src4, src5, src6, src7); > + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); > + > + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, > + sum4, sum5, sum6, sum7); > + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, > + sum4, sum5, sum6, sum7); > + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, > 2, > + sum7, sum6, 2, sum0, sum1, sum2, sum3); > + __lasx_xvstelm_d(sum0, dst, 0, 0); > + __lasx_xvstelm_d(sum0, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 0); > + __lasx_xvstelm_d(sum1, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 0); > + __lasx_xvstelm_d(sum2, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 0); > + __lasx_xvstelm_d(sum3, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum0, dst, 0, 2); > + __lasx_xvstelm_d(sum0, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 2); > + __lasx_xvstelm_d(sum1, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 2); > + __lasx_xvstelm_d(sum2, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 2); > + __lasx_xvstelm_d(sum3, dst, 8, 3); > + dst += dst_stride; > + > + 
src0 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, > src2); > + src3 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src4 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, > src6); > + src7 = __lasx_xvldx(_src, src_stride_3x); > + _src += (1 - src_stride_4x); > + src9 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src10, src11); > + src12 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src13 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src14, src15); > + src16 = __lasx_xvldx(_src, src_stride_3x); > + _src += (src_stride_4x - 1); > + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); > + > + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, > src6, 0x02, > + src3, src7, 0x02, src0, src1, src2, src3); > + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, > src14, 0x02, > + src11, src15, 0x02, src4, src5, src6, src7); > + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, > src8, src9); > + > + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, > src3, > + sum0, sum2, sum4, sum6); > + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, > src3, > + sum1, sum3, sum5, sum7); > + src8 = __lasx_xvilvl_h(src9, src4); > + src9 = __lasx_xvilvh_h(src9, src4); > + > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, > + sum3, sum3, src0, src1, src2, src3); > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, > + sum7, sum7, src4, src5, src6, src7); > + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); > + > + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, > + 
sum4, sum5, sum6, sum7); > + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, > + sum4, sum5, sum6, sum7); > + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, > 2, > + sum7, sum6, 2, sum0, sum1, sum2, sum3); > + __lasx_xvstelm_d(sum0, dst, 0, 0); > + __lasx_xvstelm_d(sum0, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 0); > + __lasx_xvstelm_d(sum1, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 0); > + __lasx_xvstelm_d(sum2, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 0); > + __lasx_xvstelm_d(sum3, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum0, dst, 0, 2); > + __lasx_xvstelm_d(sum0, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 2); > + __lasx_xvstelm_d(sum1, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 2); > + __lasx_xvstelm_d(sum2, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 2); > + __lasx_xvstelm_d(sum3, dst, 8, 3); > + dst += dst_stride; The last line is not needed. 
> +} > + > +static void common_hv_bil_no_rnd_8x16_lasx(const uint8_t *src, > + int32_t src_stride, > + uint8_t *dst, int32_t dst_stride) > +{ > + __m256i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; > + __m256i src10, src11, src12, src13, src14, src15, src16, src17; > + __m256i sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; > + int32_t src_stride_2x = src_stride << 1; > + int32_t src_stride_4x = src_stride << 2; > + int32_t src_stride_3x = src_stride_2x + src_stride; > + uint8_t* _src = (uint8_t*)src; > + > + src0 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src1, > src2); > + src3 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src4 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, src5, > src6); > + src7 = __lasx_xvldx(_src, src_stride_3x); > + _src += (1 - src_stride_4x); > + src9 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src10, src11); > + src12 = __lasx_xvldx(_src, src_stride_3x); > + _src += src_stride_4x; > + src13 = __lasx_xvld(_src, 0); > + DUP2_ARG2(__lasx_xvldx, _src, src_stride, _src, src_stride_2x, > + src14, src15); > + src16 = __lasx_xvldx(_src, src_stride_3x); > + _src += (src_stride_4x - 1); > + DUP2_ARG2(__lasx_xvld, _src, 0, _src, 1, src8, src17); > + > + DUP4_ARG3(__lasx_xvpermi_q, src0, src4, 0x02, src1, src5, 0x02, src2, > + src6, 0x02, src3, src7, 0x02, src0, src1, src2, src3); > + DUP4_ARG3(__lasx_xvpermi_q, src4, src8, 0x02, src9, src13, 0x02, src10, > + src14, 0x02, src11, src15, 0x02, src4, src5, src6, src7); > + DUP2_ARG3(__lasx_xvpermi_q, src12, src16, 0x02, src13, src17, 0x02, > src8, src9); > + > + DUP4_ARG2(__lasx_xvilvl_h, src5, src0, src6, src1, src7, src2, src8, > src3, > + sum0, sum2, sum4, sum6); > + DUP4_ARG2(__lasx_xvilvh_h, src5, src0, src6, src1, src7, src2, src8, > src3, > + sum1, sum3, sum5, sum7); > + src8 = __lasx_xvilvl_h(src9, src4); > + src9 = 
__lasx_xvilvh_h(src9, src4); > + > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum0, sum0, sum1, sum1, sum2, sum2, > + sum3, sum3, src0, src1, src2, src3); > + DUP4_ARG2(__lasx_xvhaddw_hu_bu, sum4, sum4, sum5, sum5, sum6, sum6, > + sum7, sum7, src4, src5, src6, src7); > + DUP2_ARG2(__lasx_xvhaddw_hu_bu, src8, src8, src9, src9, src8, src9); > + > + DUP4_ARG2(__lasx_xvadd_h, src0, src2, src1, src3, src2, src4, src3, src5, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvadd_h, src4, src6, src5, src7, src6, src8, src7, src9, > + sum4, sum5, sum6, sum7); > + DUP4_ARG2(__lasx_xvaddi_hu, sum0, 1, sum1, 1, sum2, 1, sum3, 1, > + sum0, sum1, sum2, sum3); > + DUP4_ARG2(__lasx_xvaddi_hu, sum4, 1, sum5, 1, sum6, 1, sum7, 1, > + sum4, sum5, sum6, sum7); > + DUP4_ARG3(__lasx_xvsrani_b_h, sum1, sum0, 2, sum3, sum2, 2, sum5, sum4, > 2, > + sum7, sum6, 2, sum0, sum1, sum2, sum3); > + __lasx_xvstelm_d(sum0, dst, 0, 0); > + __lasx_xvstelm_d(sum0, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 0); > + __lasx_xvstelm_d(sum1, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 0); > + __lasx_xvstelm_d(sum2, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 0); > + __lasx_xvstelm_d(sum3, dst, 8, 1); > + dst += dst_stride; > + __lasx_xvstelm_d(sum0, dst, 0, 2); > + __lasx_xvstelm_d(sum0, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum1, dst, 0, 2); > + __lasx_xvstelm_d(sum1, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum2, dst, 0, 2); > + __lasx_xvstelm_d(sum2, dst, 8, 3); > + dst += dst_stride; > + __lasx_xvstelm_d(sum3, dst, 0, 2); > + __lasx_xvstelm_d(sum3, dst, 8, 3); > + dst += dst_stride; This line is not needed either. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".