Hi, the impact is relatively significant (3 to 25 cycles, i.e. up to 2%), so I am also including bench.patch in case anybody wants to confirm the timings.
Although I thought openhevc's MC code was faster, the benchmarked functions are faster in ffmpeg. I didn't investigate whether any MC functions are much slower in ffmpeg, or whether something else is going on. -- Christophe
From 36dbc9b67269579e23345ec225ffa270d472b94e Mon Sep 17 00:00:00 2001 From: Christophe Gisquet <christophe.gisq...@gmail.com> Date: Thu, 24 Jul 2014 17:23:47 +0200 Subject: [PATCH 10/10] x86: hevc_mc: replace simple leas by adds lea is detrimental for those simple cases. No impact overall to the change though. Before: 15017 decicycles in q, 1016152 runs, 32424 skips 15382 decicycles in q_bi, 1013673 runs, 34903 skips 3713 decicycles in e, 2074534 runs, 22618 skips 3901 decicycles in e_bi, 2065509 runs, 31643 skips 7852 decicycles in q_uni, 520165 runs, 4123 skips 2398 decicycles in e_uni, 1043339 runs, 5237 skips After: 14898 decicycles in q, 1016295 runs, 32281 skips 15119 decicycles in q_bi, 1015392 runs, 33184 skips 3682 decicycles in e, 2073224 runs, 23928 skips 3720 decicycles in e_bi, 2065043 runs, 32109 skips 7643 decicycles in q_uni, 520280 runs, 4008 skips 2363 decicycles in e_uni, 1043780 runs, 4796 skips --- libavcodec/x86/hevc_mc.asm | 120 ++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm index 5cf37d0..30edc52 100644 --- a/libavcodec/x86/hevc_mc.asm +++ b/libavcodec/x86/hevc_mc.asm @@ -520,8 +520,8 @@ cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstri .loop SIMPLE_LOAD %1, %2, srcq, m0 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop RET @@ -535,8 +535,8 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstrid MC_PIXEL_COMPUTE %1, %2 BI_COMPUTE %1, %2, m0, m1, m3, m4, m5 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride 
lea src2q, [src2q+2*src2strideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -573,8 +573,8 @@ cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, EPEL_COMPUTE %2, %1, m4, m5 UNI_COMPUTE %1, %2, m0, m1, m6 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop RET @@ -588,8 +588,8 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, s SIMPLE_BILOAD %1, src2q, m2, m3 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride lea src2q, [src2q+2*src2strideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -623,8 +623,8 @@ cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, EPEL_COMPUTE %2, %1, m4, m5 UNI_COMPUTE %1, %2, m0, m1, m6 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop RET @@ -641,8 +641,8 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride, SIMPLE_BILOAD %1, src2q, m2, m3 BI_COMPUTE %1, %2, m0, m1, m2, m3, m6 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride lea src2q, [src2q+2*src2strideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -664,15 +664,15 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, 
srcstride, h EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m4, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m5, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m6, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq .loop EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 @@ -698,15 +698,15 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstrid EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m4, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m5, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m6, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq .loop EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 @@ -723,8 +723,8 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstrid movdqa m4, m5 movdqa m5, m6 movdqa m6, m7 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop RET @@ -737,15 +737,15 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m4, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m5, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 SWAP m6, m0 - lea srcq, [srcq + srcstrideq] + add srcq, 
srcstrideq .loop EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m14, m15 @@ -763,8 +763,8 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride movdqa m4, m5 movdqa m5, m6 movdqa m6, m7 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride lea src2q, [src2q+2*src2strideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -801,8 +801,8 @@ cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride %endif UNI_COMPUTE %1, %2, m0, m1, m9 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop RET @@ -819,8 +819,8 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride, SIMPLE_BILOAD %1, src2q, m10, m11 BI_COMPUTE %1, %2, m0, m1, m10, m11, m9 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride lea src2q, [src2q+2*src2strideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -858,8 +858,8 @@ cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstrid %endif UNI_COMPUTE %1, %2, m0, m1, m9 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop RET @@ -877,8 +877,8 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 14, 16 , dst, dststride, src, srcstride %endif BI_COMPUTE %1, %2, m0, m1, m10, m11, m9 PEL_%2STORE%1 dstq, m0, m1 - lea dstq, 
[dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride lea src2q, [src2q+2*src2strideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -900,31 +900,31 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, h QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m8, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m9, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m10, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m11, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m12, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m13, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m14, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq .loop QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw @@ -969,31 +969,31 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstrid QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m8, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m9, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m10, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m11, m0 - lea srcq, [srcq + srcstrideq] + add srcq, 
srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m12, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m13, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m14, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq .loop QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw @@ -1029,8 +1029,8 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstrid movdqa m13, m14 movdqa m14, m15 %endif - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop RET @@ -1043,31 +1043,31 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m8, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m9, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m10, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m11, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m12, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m13, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw SWAP m14, m0 - lea srcq, [srcq + srcstrideq] + add srcq, srcstrideq .loop QPEL_H_LOAD %2, srcq, %1, 15 QPEL_HV_COMPUTE %1, %2, mx, ackssdw @@ -1104,8 +1104,8 @@ cglobal 
hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride movdqa m13, m14 movdqa m14, m15 %endif - lea dstq, [dstq+dststrideq] ; dst += dststride - lea srcq, [srcq+srcstrideq] ; src += srcstride + add dstq, dststrideq ; dst += dststride + add srcq, srcstrideq ; src += srcstride lea src2q, [src2q+2*src2strideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -1158,7 +1158,7 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, heigh pminsw m0, [max_pixels_%2] %endif PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride + add dstq, dststrideq ; dst += dststride lea srcq, [srcq+2*srcstrideq] ; src += srcstride dec heightd ; cmp height jnz .loop ; height loop @@ -1211,7 +1211,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, pminsw m0, [max_pixels_%2] %endif PEL_%2STORE%1 dstq, m0, m1 - lea dstq, [dstq+dststrideq] ; dst += dststride + add dstq, dststrideq ; dst += dststride lea srcq, [srcq+2*srcstrideq] ; src += srcstride lea src2q, [src2q+2*src2strideq] ; src2 += srcstride dec r6d ; cmp height -- 1.9.2.msysgit.0
diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c index 913385a..881562c 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c @@ -1339,13 +1339,18 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, srcstride = edge_emu_stride; } - if (!weight_flag) + if (!weight_flag) { + START_TIMER s->hevcdsp.put_hevc_qpel_uni[idx][!!my][!!mx](dst, dststride, src, srcstride, block_h, mx, my, block_w); - else + STOP_TIMER("q_uni") + } else { + START_TIMER s->hevcdsp.put_hevc_qpel_uni_w[idx][!!my][!!mx](dst, dststride, src, srcstride, block_h, s->sh.luma_log2_weight_denom, luma_weight, luma_offset, mx, my, block_w); + STOP_TIMER("q_uni_w") + } } /** @@ -1423,12 +1428,20 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, src1stride = edge_emu_stride; } + { + START_TIMER s->hevcdsp.put_hevc_qpel[idx][!!my0][!!mx0](tmp, MAX_PB_SIZE, src0, src0stride, block_h, mx0, my0, block_w); - if (!weight_flag) + STOP_TIMER("q") + } + if (!weight_flag) { + START_TIMER s->hevcdsp.put_hevc_qpel_bi[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, tmp, MAX_PB_SIZE, block_h, mx1, my1, block_w); - else + STOP_TIMER("q_bi") + } + else { + START_TIMER s->hevcdsp.put_hevc_qpel_bi_w[idx][!!my1][!!mx1](dst, dststride, src1, src1stride, tmp, MAX_PB_SIZE, block_h, s->sh.luma_log2_weight_denom, s->sh.luma_weight_l0[current_mv->ref_idx[0]], @@ -1436,7 +1449,8 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, s->sh.luma_offset_l0[current_mv->ref_idx[0]], s->sh.luma_offset_l1[current_mv->ref_idx[1]], mx1, my1, block_w); - + STOP_TIMER("q_bi_w") + } } /** @@ -1495,13 +1509,19 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, src0 = lc->edge_emu_buffer + buf_offset0; srcstride = edge_emu_stride; } - if (!weight_flag) + if (!weight_flag) { + START_TIMER s->hevcdsp.put_hevc_epel_uni[idx][!!my][!!mx](dst0, dststride, src0, srcstride, block_h, _mx, _my, block_w); - else + STOP_TIMER("e_uni") + } + else { + START_TIMER 
s->hevcdsp.put_hevc_epel_uni_w[idx][!!my][!!mx](dst0, dststride, src0, srcstride, block_h, s->sh.chroma_log2_weight_denom, chroma_weight, chroma_offset, _mx, _my, block_w); + STOP_TIMER("e_uni_w") + } } /** @@ -1595,13 +1615,21 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF src2stride = edge_emu_stride; } + { + START_TIMER s->hevcdsp.put_hevc_epel[idx][!!my0][!!mx0](tmp, tmpstride, src1, src1stride, block_h, _mx0, _my0, block_w); - if (!weight_flag) + STOP_TIMER("e") + } + if (!weight_flag) { + START_TIMER s->hevcdsp.put_hevc_epel_bi[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1], src2, src2stride, tmp, tmpstride, block_h, _mx1, _my1, block_w); - else + STOP_TIMER("e_bi") + } + else { + START_TIMER s->hevcdsp.put_hevc_epel_bi_w[idx][!!my1][!!mx1](dst0, s->frame->linesize[cidx+1], src2, src2stride, tmp, tmpstride, block_h, @@ -1611,6 +1639,8 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF s->sh.chroma_offset_l0[current_mv->ref_idx[0]][cidx], s->sh.chroma_offset_l1[current_mv->ref_idx[1]][cidx], _mx1, _my1, block_w); + STOP_TIMER("e_bi_w") + } } static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref,
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel