Use the ADD8 macro to replace runs of consecutive addition operations.
---
 libavcodec/mips/vp3dsp_idct_msa.c   | 80 ++++++++-----------------------------
 libavutil/mips/generic_macros_msa.h |  6 +++
 2 files changed, 22 insertions(+), 64 deletions(-)
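Note for reviewers: ADD8 is layered on the ADD4/ADD2 helpers already in
generic_macros_msa.h, so each call expands to eight independent element-wise
vector additions. A minimal sketch of the expansion (the ADD2 body below
matches the existing helper in that header modulo formatting; treat it as
illustrative, not part of this patch):

    /* Existing base case in generic_macros_msa.h: two independent adds. */
    #define ADD2(in0, in1, in2, in3, out0, out1)  \
    {                                             \
        out0 = in0 + in1;                         \
        out1 = in2 + in3;                         \
    }

    /* Hence ADD8(A, c0, B, c7, C, c1, D, c2, E, c3, F, c4, G, c5, H, c6,
     *           A, B, C, D, E, F, G, H);
     * expands to exactly the eight statements it replaces:
     *     A = A + c0;  B = B + c7;  C = C + c1;  D = D + c2;
     *     E = E + c3;  F = F + c4;  G = G + c5;  H = H + c6;
     */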
diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c
index 90c578f..e4cd377 100644
--- a/libavcodec/mips/vp3dsp_idct_msa.c
+++ b/libavcodec/mips/vp3dsp_idct_msa.c
@@ -178,14 +178,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
                    c0, c1, c2, c3);
         ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                    c4, c5, c6, c7);
-        A += c0;
-        B += c7;
-        C += c1;
-        D += c2;
-        E += c3;
-        F += c4;
-        G += c5;
-        H += c6;
+        ADD8(A, c0, B, c7, C, c1, D, c2, E, c3, F, c4, G, c5, H, c6,
+             A, B, C, D, E, F, G, H);
     }
     CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
     sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
@@ -208,14 +202,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
         Gd = Bdd;
         Hd = Bdd;
     } else {
-        Ad = Add + c0;
-        Bd = Add + c1;
-        Cd = Add + c2;
-        Dd = Add + c3;
-        Ed = Add + c4;
-        Fd = Add + c5;
-        Gd = Add + c6;
-        Hd = Add + c7;
+        ADD8(Add, c0, Add, c1, Add, c2, Add, c3, Add, c4, Add, c5, Add, c6,
+             Add, c7, Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
         CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
     }
     Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
@@ -235,14 +223,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
     F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
     G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
     H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
-    r0_r = Ad + A;
-    r1_r = Bd + C;
-    r2_r = Cd + D;
-    r3_r = Dd + E;
-    r0_l = Ed + F;
-    r1_l = Fd + G;
-    r2_l = Gd + H;
-    r3_l = Hd + B;
+    ADD8(Ad, A, Bd, C, Cd, D, Dd, E, Ed, F, Fd, G, Gd, H, Hd, B,
+         r0_r, r1_r, r2_r, r3_r, r0_l, r1_l, r2_l, r3_l);
 
     /* Row 4 to 7 */
     TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
@@ -286,14 +268,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
                    c0, c1, c2, c3);
         ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                    c4, c5, c6, c7);
-        A += c0;
-        B += c7;
-        C += c1;
-        D += c2;
-        E += c3;
-        F += c4;
-        G += c5;
-        H += c6;
+        ADD8(A, c0, B, c7, C, c1, D, c2, E, c3, F, c4, G, c5, H, c6,
+             A, B, C, D, E, F, G, H);
     }
     CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
     sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
@@ -316,14 +292,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
         Gd = Bdd;
         Hd = Bdd;
     } else {
-        Ad = Add + c0;
-        Bd = Add + c1;
-        Cd = Add + c2;
-        Dd = Add + c3;
-        Ed = Add + c4;
-        Fd = Add + c5;
-        Gd = Add + c6;
-        Hd = Add + c7;
+        ADD8(Add, c0, Add, c1, Add, c2, Add, c3, Add, c4, Add, c5, Add, c6,
+             Add, c7, Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
         CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
     }
     Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
@@ -343,14 +313,8 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
     F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
     G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
     H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
-    r4_r = Ad + A;
-    r5_r = Bd + C;
-    r6_r = Cd + D;
-    r7_r = Dd + E;
-    r4_l = Ed + F;
-    r5_l = Fd + G;
-    r6_l = Gd + H;
-    r7_l = Hd + B;
+    ADD8(Ad, A, Bd, C, Cd, D, Dd, E, Ed, F, Fd, G, Gd, H, Hd, B,
+         r4_r, r5_r, r6_r, r7_r, r4_l, r5_l, r6_l, r7_l);
     VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
     VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
     VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
@@ -400,14 +364,8 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
                e0, e1, e2, e3);
     ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
                e4, e5, e6, e7);
-    e0 += dc;
-    e1 += dc;
-    e2 += dc;
-    e3 += dc;
-    e4 += dc;
-    e5 += dc;
-    e6 += dc;
-    e7 += dc;
+    ADD8(e0, dc, e1, dc, e2, dc, e3, dc, e4, dc, e5, dc, e6, dc, e7, dc,
+         e0, e1, e2, e3, e4, e5, e6, e7);
     CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);
 
     /* Left part */
@@ -415,14 +373,8 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
                r0, r1, r2, r3);
     ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
                r4, r5, r6, r7);
-    r0 += dc;
-    r1 += dc;
-    r2 += dc;
-    r3 += dc;
-    r4 += dc;
-    r5 += dc;
-    r6 += dc;
-    r7 += dc;
+    ADD8(r0, dc, r1, dc, r2, dc, r3, dc, r4, dc, r5, dc, r6, dc, r7, dc,
+         r0, r1, r2, r3, r4, r5, r6, r7);
     CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
     VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
     VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index c085d58..3d892ce 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -2153,6 +2153,12 @@
     ADD2(in0, in1, in2, in3, out0, out1);                                 \
     ADD2(in4, in5, in6, in7, out2, out3);                                 \
 }
+#define ADD8(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, \
+             in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7)   \
+{                                                                                 \
+    ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3);         \
+    ADD4(in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, out6, out7);   \
+}
 
 /* Description : Subtraction of 2 pairs of vectors
    Arguments   : Inputs  - in0, in1, in2, in3
--
2.1.0
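P.S. If it helps review, the argument ordering can be sanity-checked on any
host by instantiating the same ADD2/ADD4/ADD8 shape over plain scalars
instead of MSA vectors. The standalone sketch below is illustrative only
(none of it is part of the patch):

    #include <stdio.h>

    /* Same shape as the generic_macros_msa.h helpers, over plain ints. */
    #define ADD2(in0, in1, in2, in3, out0, out1)                              \
    {                                                                         \
        out0 = in0 + in1;                                                     \
        out1 = in2 + in3;                                                     \
    }
    #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                 out0, out1, out2, out3)                                      \
    {                                                                         \
        ADD2(in0, in1, in2, in3, out0, out1);                                 \
        ADD2(in4, in5, in6, in7, out2, out3);                                 \
    }
    #define ADD8(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,     \
                 in11, in12, in13, in14, in15,                                \
                 out0, out1, out2, out3, out4, out5, out6, out7)              \
    {                                                                         \
        ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3); \
        ADD4(in8, in9, in10, in11, in12, in13, in14, in15,                    \
             out4, out5, out6, out7);                                         \
    }

    int main(void)
    {
        int a = 1, b = 2, c = 3, d = 4, e = 5, f = 6, g = 7, h = 8;
        /* In-place accumulation, mirroring the idct_msa() call sites:
         * each output aliases its first input. */
        ADD8(a, 10, b, 20, c, 30, d, 40, e, 50, f, 60, g, 70, h, 80,
             a, b, c, d, e, f, g, h);
        printf("%d %d %d %d %d %d %d %d\n", a, b, c, d, e, f, g, h);
        /* Prints: 11 22 33 44 55 66 77 88 */
        return 0;
    }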