From: Shivraj Patil <shivraj.pa...@imgtec.com> This patch adds MSA (MIPS-SIMD-Arch) optimizations for VP9 intra functions in new file vp9_intra_msa.c
Signed-off-by: Shivraj Patil <shivraj.pa...@imgtec.com> --- libavcodec/mips/Makefile | 3 +- libavcodec/mips/vp9_intra_msa.c | 880 +++++++++++++++++++++++++++++++++++++ libavcodec/mips/vp9dsp_init_mips.c | 23 + libavcodec/mips/vp9dsp_mips.h | 73 +++ 4 files changed, 978 insertions(+), 1 deletion(-) create mode 100644 libavcodec/mips/vp9_intra_msa.c diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index 7dca55d..b71d2c4 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -44,7 +44,8 @@ MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \ mips/hevcpred_msa.o MSA-OBJS-$(CONFIG_VP9_DECODER) += mips/vp9_mc_msa.o \ mips/vp9_lpf_msa.o \ - mips/vp9_idct_msa.o + mips/vp9_idct_msa.o \ + mips/vp9_intra_msa.o MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o \ mips/h264idct_msa.o MSA-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_msa.o diff --git a/libavcodec/mips/vp9_intra_msa.c b/libavcodec/mips/vp9_intra_msa.c new file mode 100644 index 0000000..e29e727 --- /dev/null +++ b/libavcodec/mips/vp9_intra_msa.c @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2015 Shivraj Patil (shivraj.pa...@imgtec.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/vp9dsp.h" +#include "libavutil/mips/generic_macros_msa.h" +#include "vp9dsp_mips.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ +{ \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ +} + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) +{ /* vertical prediction, 4x4: one 32-bit word read from src is passed four times to SW4 (presumably one store per row - confirm in generic_macros_msa.h) */ + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) +{ /* vertical prediction, 8x8: the words at src and src + 4 are written to dst and dst + 4 on each of 8 rows */ + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) +{ /* vertical prediction, 16x16: one v16u8 vector loaded from src is stored on each of 16 rows */ + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) +{ /* vertical prediction, 32x32: the vectors at src and src + 16 are stored on each of 32 rows */ + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ /* horizontal prediction, 4x4: each output word is one src byte replicated into all four byte positions */ + uint32_t out0, out1, out2, out3; + + out0 = src[0 * src_stride] * 0x01010101U; /* unsigned constant: (int) pixel * 0x01010101 overflows signed int for pixels >= 128 */ + out1 = src[1 * src_stride] * 0x01010101U; + out2 = src[2 * src_stride] * 0x01010101U; + out3 = src[3 * src_stride] * 0x01010101U; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t
dst_stride) +{ + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0 * src_stride] * 0x0101010101010101; + out1 = src[1 * src_stride] * 0x0101010101010101; + out2 = src[2 * src_stride] * 0x0101010101010101; + out3 = src[3 * src_stride] * 0x0101010101010101; + out4 = src[4 * src_stride] * 0x0101010101010101; + out5 = src[5 * src_stride] * 0x0101010101010101; + out6 = src[6 * src_stride] * 0x0101010101010101; + out7 = src[7 * src_stride] * 0x0101010101010101; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + src += src_stride; + inp1 = src[0]; + src += src_stride; + inp2 = src[0]; + src += src_stride; + inp3 = src[0]; + src += src_stride; + + src0 = (v16u8) __msa_fill_b(inp0); + src1 = (v16u8) __msa_fill_b(inp1); + src2 = (v16u8) __msa_fill_b(inp2); + src3 = (v16u8) __msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, + int32_t src_stride, + uint8_t *dst, int32_t dst_stride) +{ + uint32_t row; + uint8_t inp0, inp1; + v16u8 src0, src1; + + for (row = 16; row--;) { + inp0 = src[0]; + src += src_stride; + inp1 = src[0]; + src += src_stride; + + src0 = (v16u8) __msa_fill_b(inp0); + src1 = (v16u8) __msa_fill_b(inp1); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, + int32_t src_stride_left, + uint8_t *dst, int32_t dst_stride, + uint8_t is_above, uint8_t is_left) +{ + uint32_t row; + uint32_t out, addition = 0; + v16u8 src_above, store; + v8u16 sum_above; 
+ v4u32 sum; + + if (is_left && is_above) { /* both edges: element 0 of the reduced top sum plus 4 left samples, then (sum + 4) >> 3 */ + src_above = LD_UB(src_top); /* NOTE(review): loads a whole v16u8 although only element 0 of the reduction is used - confirm the top buffer is readable that far */ + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum = __msa_hadd_u_w(sum_above, sum_above); + addition = __msa_copy_u_w((v4i32) sum, 0); + + for (row = 0; row < 4; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 4) >> 3; + store = (v16u8) __msa_fill_b(addition); + } else if (is_left) { /* left edge only: 4 left samples, then (sum + 2) >> 2 */ + for (row = 0; row < 4; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 2) >> 2; + store = (v16u8) __msa_fill_b(addition); + } else if (is_above) { /* top edge only: the reduction is shifted right by 2 with srari (presumably a rounding shift - confirm in the MSA reference), then byte 0 is splatted */ + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum = __msa_hadd_u_w(sum_above, sum_above); + sum = (v4u32) __msa_srari_w((v4i32) sum, 2); + store = (v16u8) __msa_splati_b((v16i8) sum, 0); + } else { /* neither edge: constant 128 */ + store = (v16u8) __msa_ldi_b(128); + } + + out = __msa_copy_u_w((v4i32) store, 0); + + for (row = 4; row--;) { /* write word 0 of store to each of the 4 rows */ + SW(out, dst); + dst += dst_stride; + } +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, + int32_t src_stride_left, + uint8_t *dst, int32_t dst_stride, + uint8_t is_above, uint8_t is_left) +{ /* DC prediction, 8x8: same four-way branch on is_left / is_above as the 4x4 helper, with one extra hadd step and larger shifts */ + uint32_t row; + uint32_t out, addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum_top; + v2u64 sum; + + if (is_left && is_above) { /* both edges: reduced top sum plus 8 left samples, then (sum + 8) >> 4 */ + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + addition = __msa_copy_u_w((v4i32) sum, 0); + + for (row = 0; row < 8; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 8) >> 4; + store = (v16u8) __msa_fill_b(addition); + } else if (is_left) { /* left edge only: 8 left samples, then (sum + 4) >> 3 */ + for (row = 0; row < 8; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 4) >> 3; + store = (v16u8) __msa_fill_b(addition); + } else if (is_above) { /* top edge only */ + src_above = LD_UB(src_top); + + sum_above =
__msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum = (v2u64) __msa_srari_d((v2i64) sum, 3); /* shift by 3 = divide by 8 top samples (srari presumably rounds - confirm) */ + store = (v16u8) __msa_splati_b((v16i8) sum, 0); + } else { /* neither edge: constant 128 */ + store = (v16u8) __msa_ldi_b(128); + } + + out = __msa_copy_u_w((v4i32) store, 0); + + for (row = 8; row--;) { /* the same word is written at dst and dst + 4 on each of 8 rows */ + SW(out, dst); + SW(out, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + int32_t src_stride_left, + uint8_t *dst, int32_t dst_stride, + uint8_t is_above, uint8_t is_left) +{ /* DC prediction, 16x16: four-way branch on is_left / is_above; the chosen value is stored as a full vector on 16 rows */ + uint32_t row; + uint32_t addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum_top; + v2u64 sum; + + if (is_left && is_above) { /* both edges: reduced top sum plus 16 left samples, then (sum + 16) >> 5 */ + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum); /* presumably packs the two 64-bit partial sums into adjacent words so the next hadd can combine them - confirm */ + sum = __msa_hadd_u_d(sum_top, sum_top); + addition = __msa_copy_u_w((v4i32) sum, 0); + + for (row = 0; row < 16; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 16) >> 5; + store = (v16u8) __msa_fill_b(addition); + } else if (is_left) { /* left edge only: 16 left samples, then (sum + 8) >> 4 */ + for (row = 0; row < 16; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 8) >> 4; + store = (v16u8) __msa_fill_b(addition); + } else if (is_above) { /* top edge only: the reduced sum is shifted right by 4 and byte 0 is splatted */ + src_above = LD_UB(src_top); + + sum_above = __msa_hadd_u_h(src_above, src_above); + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum = (v2u64) __msa_srari_d((v2i64) sum, 4); + store = (v16u8) __msa_splati_b((v16i8) sum, 0); + } else { /* neither edge: constant 128 */ + store = (v16u8) __msa_ldi_b(128); + } + + for (row = 16; row--;) { + ST_UB(store, dst); + dst += dst_stride; + } +} + +static void
intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + int32_t src_stride_left, + uint8_t *dst, int32_t dst_stride, + uint8_t is_above, uint8_t is_left) +{ /* DC prediction, 32x32: four-way branch on is_left / is_above; the chosen value is stored as two vectors per row on 32 rows */ + uint32_t row; + uint32_t addition = 0; + v16u8 src_above1, src_above2, store; + v8u16 sum_above1, sum_above2, sum_above; + v4u32 sum_top; + v2u64 sum; + + if (is_left && is_above) { /* both edges: reduced sum of the vectors at src_top and src_top + 16, plus 32 left samples, then (sum + 32) >> 6 */ + src_above1 = LD_UB(src_top); + src_above2 = LD_UB(src_top + 16); + + HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2); + + sum_above = sum_above1 + sum_above2; + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = __msa_hadd_u_d(sum_top, sum_top); + addition = __msa_copy_u_w((v4i32) sum, 0); + + for (row = 0; row < 32; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 32) >> 6; + store = (v16u8) __msa_fill_b(addition); + } else if (is_left) { /* left edge only: 32 left samples, then (sum + 16) >> 5 */ + for (row = 0; row < 32; row++) { + addition += src_left[row * src_stride_left]; + } + + addition = (addition + 16) >> 5; + store = (v16u8) __msa_fill_b(addition); + } else if (is_above) { /* top edge only: the reduced sum is shifted right by 5 and byte 0 is splatted */ + src_above1 = LD_UB(src_top); + src_above2 = LD_UB(src_top + 16); + + HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2); + + sum_above = sum_above1 + sum_above2; + sum_top = __msa_hadd_u_w(sum_above, sum_above); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum); + sum = __msa_hadd_u_d(sum_top, sum_top); + sum = (v2u64) __msa_srari_d((v2i64) sum, 5); + store = (v16u8) __msa_splati_b((v16i8) sum, 0); + } else { /* neither edge: constant 128 */ + store = (v16u8) __msa_ldi_b(128); + } + for (row = 32; row--;) { + ST_UB2(store, store, dst, 16); + dst += dst_stride; + } +} + +#define INTRA_PREDICT_VALDC_4X4_MSA(val) \ +static void intra_predict_##val##dc_4x4_msa(uint8_t *dst, \ + int32_t dst_stride) \ +{ \ + uint32_t row, out; \ + v16i8 store; \ + \ + store = __msa_ldi_b(val); \ + out =
__msa_copy_u_w((v4i32) store, 0); \ + \ + for (row = 4; row--;) \ + { \ + SW(out, dst); \ + dst += dst_stride; \ + } \ +} + +INTRA_PREDICT_VALDC_4X4_MSA(127) /* no trailing ';': the macro expands to a complete function definition, and a lone ';' at file scope is not valid ISO C before C23 */ +INTRA_PREDICT_VALDC_4X4_MSA(129) + +#define INTRA_PREDICT_VALDC_8X8_MSA(val) \ +static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, \ + int32_t dst_stride) \ +{ \ + uint32_t row, out; \ + v16i8 store; \ + \ + store = __msa_ldi_b(val); \ + out = __msa_copy_u_w((v4i32) store, 0); \ + \ + for (row = 8; row--;) \ + { \ + SW(out, dst); \ + SW(out, (dst + 4)); \ + dst += dst_stride; \ + } \ +} + +INTRA_PREDICT_VALDC_8X8_MSA(127) /* defines intra_predict_127dc_8x8_msa: fills an 8x8 block with the constant */ +INTRA_PREDICT_VALDC_8X8_MSA(129) + +#define INTRA_PREDICT_VALDC_16X16_MSA(val) \ +static void intra_predict_##val##dc_16x16_msa(uint8_t *dst, \ + int32_t dst_stride) \ +{ \ + uint32_t row; \ + v16u8 store; \ + \ + store = (v16u8) __msa_ldi_b(val); \ + \ + for (row = 16; row--;) \ + { \ + ST_UB(store, dst); \ + dst += dst_stride; \ + } \ +} + +INTRA_PREDICT_VALDC_16X16_MSA(127) /* defines intra_predict_127dc_16x16_msa: one vector store per row, 16 rows */ +INTRA_PREDICT_VALDC_16X16_MSA(129) + +#define INTRA_PREDICT_VALDC_32X32_MSA(val) \ +static void intra_predict_##val##dc_32x32_msa(uint8_t *dst, \ + int32_t dst_stride) \ +{ \ + uint32_t row; \ + v16u8 store; \ + \ + store = (v16u8) __msa_ldi_b(val); \ + \ + for (row = 32; row--;) \ + { \ + ST_UB2(store, store, dst, 16); \ + dst += dst_stride; \ + } \ +} + +INTRA_PREDICT_VALDC_32X32_MSA(127) /* defines intra_predict_127dc_32x32_msa: two vector stores per row, 32 rows */ +INTRA_PREDICT_VALDC_32X32_MSA(129) + +static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + int32_t src_left_stride, + uint8_t *dst, int32_t dst_stride) +{ /* TM prediction, 4x4: combines the top row, four left samples and the top-left sample read from src_top_ptr[-1] */ + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = (v8u16) __msa_fill_h(top_left); + src_top = LD_SB(src_top_ptr); + + src_left0 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left1 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; +
src_left2 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left3 = __msa_fill_b(src_left[0]); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); /* pair each left sample with the top row; presumably interleaves bytes so the following HADD yields left + top per pixel - confirm in generic_macros_msa.h */ + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); /* subtract the top-left sample via __msa_subs_u_h */ + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); /* presumably clamps each halfword to 8 bits before packing - confirm */ + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + int32_t src_left_stride, + uint8_t *dst, + int32_t dst_stride) +{ /* TM prediction, 8x8: same per-row sequence as the 4x4 helper, run as 2 iterations of 4 rows */ + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16) __msa_fill_h(top_left); + + for (loop_cnt = 2; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left1 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left2 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left3 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); /* advance past the 4 rows just written */ + } +} + +static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + int32_t src_left_stride, + uint8_t *dst, + int32_t dst_stride) +{ + uint8_t
top_left = src_top_ptr[-1]; /* top-left sample is the byte just before the top row */ + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16) __msa_fill_h(top_left); + + for (loop_cnt = 4; loop_cnt--;) { /* 4 iterations x 4 rows = 16 rows */ + src_left0 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left1 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left2 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left3 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); /* row 0 of this group: right and left halves handled as res_r / res_l */ + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); /* row 1 */ + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); /* row 2 */ + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); /* row 3 */ + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +static void intra_predict_tm_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + int32_t src_left_stride, + uint8_t *dst, + int32_t dst_stride) +{ /* TM prediction, 32x32: the top row is held as two vectors (src_top and src_top + 16) */ + uint8_t top_left = src_top[-1]; + uint32_t loop_cnt; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + src_top0 = LD_SB(src_top); + src_top1 = LD_SB(src_top + 16); + src_top_left = (v8u16) __msa_fill_h(top_left); + + for
(loop_cnt = 8; loop_cnt--;) { /* 8 iterations x 4 rows = 32 rows */ + src_left0 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left1 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left2 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + src_left3 = __msa_fill_b(src_left[0]); + src_left += src_left_stride; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); /* row 0: res_*0 belongs to columns stored at dst, res_*1 to columns stored at dst + 16 */ + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); /* row 1 */ + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); /* row 2 */ + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); /* row 3 */ + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0,
res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, + res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + } +} + +void ff_vert_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* public entry points: the ff_vert_* wrappers forward only top; left is unused */ + intra_predict_vert_4x4_msa(top, dst, stride); +} + +void ff_vert_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_vert_8x8_msa(top, dst, stride); +} + +void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_vert_16x16_msa(top, dst, stride); +} + +void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_vert_32x32_msa(top, dst, stride); +} + +void ff_hor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* the ff_hor_* wrappers start at left + (size - 1) and walk backwards with stride -1; top is unused */ + intra_predict_horiz_4x4_msa(left + 3, -1, dst, stride); +} + +void ff_hor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_horiz_8x8_msa(left + 7, -1, dst, stride); +} + +void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_horiz_16x16_msa(left + 15, -1, dst, stride); +} + +void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_horiz_32x32_msa(left + 31, -1, dst, stride); +} + +void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* the ff_dc_* wrappers pass left stride 1 and set both is_above and is_left */ + intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 1, 1); +} + +void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 1, 1); +} + +void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t
stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 1, 1); +} + +void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 1, 1); +} + +void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* the ff_dc_left_* wrappers: is_above = 0, is_left = 1 */ + intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 0, 1); +} + +void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 0, 1); +} + +void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 0, 1); +} + +void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 0, 1); +} + +void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* the ff_dc_top_* wrappers: is_above = 1, is_left = 0 */ + intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 1, 0); +} + +void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 1, 0); +} + +void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 1, 0); +} + +void ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 1, 0); +} + +void ff_dc_128_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* the ff_dc_128_* wrappers: both flags 0, so the helper takes its constant-128 branch */ + intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 0, 0); +} + +void ff_dc_128_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 0, 0); +} + +void
ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 0, 0); +} + +void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 0, 0); +} + +void ff_dc_127_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* the ff_dc_127_* wrappers call the macro-generated constant-fill helpers; left and top are unused */ + intra_predict_127dc_4x4_msa(dst, stride); +} + +void ff_dc_127_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_127dc_8x8_msa(dst, stride); +} + +void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_127dc_16x16_msa(dst, stride); +} + +void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_127dc_32x32_msa(dst, stride); +} + +void ff_dc_129_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ /* the ff_dc_129_* wrappers: same as the 127 variants with the constant 129 */ + intra_predict_129dc_4x4_msa(dst, stride); +} + +void ff_dc_129_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top) +{ + intra_predict_129dc_8x8_msa(dst, stride); +} + +void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_129dc_16x16_msa(dst, stride); +} + +void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_129dc_32x32_msa(dst, stride); +} + +void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ /* the ff_tm_* wrappers pass left + (size - 1) with stride -1, as the ff_hor_* wrappers do */ + intra_predict_tm_4x4_msa(top, left + 3, -1, dst, stride); +} + +void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_tm_8x8_msa(top, left + 7, -1, dst, stride); +} + +void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ +
intra_predict_tm_16x16_msa(top, left + 15, -1, dst, stride); +} + +void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top) +{ + intra_predict_tm_32x32_msa(top, left + 31, -1, dst, stride); +} diff --git a/libavcodec/mips/vp9dsp_init_mips.c b/libavcodec/mips/vp9dsp_init_mips.c index 79cafb8..6c03601 100644 --- a/libavcodec/mips/vp9dsp_init_mips.c +++ b/libavcodec/mips/vp9dsp_init_mips.c @@ -24,6 +24,28 @@ #include "vp9dsp_mips.h" #if HAVE_MSA +static av_cold void vp9dsp_intrapred_init_msa(VP9DSPContext *dsp, int bpp) +{ /* installs the MSA intra predictors into dsp->intra_pred; nothing is changed unless bpp == 8 */ + if (bpp == 8) { +#define init_intra_pred_msa(tx, sz) \ + dsp->intra_pred[tx][VERT_PRED] = ff_vert_##sz##_msa; \ + dsp->intra_pred[tx][HOR_PRED] = ff_hor_##sz##_msa; \ + dsp->intra_pred[tx][DC_PRED] = ff_dc_##sz##_msa; \ + dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa; \ + dsp->intra_pred[tx][TOP_DC_PRED] = ff_dc_top_##sz##_msa; \ + dsp->intra_pred[tx][DC_128_PRED] = ff_dc_128_##sz##_msa; \ + dsp->intra_pred[tx][DC_127_PRED] = ff_dc_127_##sz##_msa; \ + dsp->intra_pred[tx][DC_129_PRED] = ff_dc_129_##sz##_msa; \ + dsp->intra_pred[tx][TM_VP8_PRED] = ff_tm_##sz##_msa; \ + + init_intra_pred_msa(TX_4X4, 4x4); /* nine prediction modes are set per transform size; other intra_pred entries are left untouched */ + init_intra_pred_msa(TX_8X8, 8x8); + init_intra_pred_msa(TX_16X16, 16x16); + init_intra_pred_msa(TX_32X32, 32x32); +#undef init_intra_pred_msa + } +} + static av_cold void vp9dsp_itxfm_init_msa(VP9DSPContext *dsp, int bpp) { if (bpp == 8) { @@ -129,6 +151,7 @@ static av_cold void vp9dsp_loopfilter_init_msa(VP9DSPContext *dsp, int bpp) static av_cold void vp9dsp_init_msa(VP9DSPContext *dsp, int bpp) { + vp9dsp_intrapred_init_msa(dsp, bpp); vp9dsp_itxfm_init_msa(dsp, bpp); vp9dsp_mc_init_msa(dsp, bpp); vp9dsp_loopfilter_init_msa(dsp, bpp); diff --git a/libavcodec/mips/vp9dsp_mips.h b/libavcodec/mips/vp9dsp_mips.h index 9f59d45..1df8fa9 100644 --- a/libavcodec/mips/vp9dsp_mips.h +++ b/libavcodec/mips/vp9dsp_mips.h @@ -150,4 +150,77 @@ void ff_idct_iadst_16x16_add_msa(uint8_t *pu8Dest, ptrdiff_t
stride, void ff_iwht_iwht_4x4_add_msa(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); +void ff_vert_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_vert_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_hor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_hor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void 
ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_128_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_128_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_127_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_127_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_129_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_129_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, const uint8_t *top); +void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); +void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, + const uint8_t *top); + #endif // #ifndef AVCODEC_MIPS_VP9DSP_MIPS_H -- 2.3.7 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel