Now that the xmm register and gpr counts have decreased, it is possible to port this code to x86_32. To save on code, x86_32 is handled as if it were PIC, whether or not PIC is actually enabled. --- libavcodec/x86/hevc_mc.asm | 39 +++++++++++++++++++++++-------- libavcodec/x86/hevcdsp.h | 4 +++- libavcodec/x86/hevcdsp_init.c | 53 +++++++++++++++++++++++++++---------------- 3 files changed, 66 insertions(+), 30 deletions(-)
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm index a127a4d..085a212 100644 --- a/libavcodec/x86/hevc_mc.asm +++ b/libavcodec/x86/hevc_mc.asm @@ -59,9 +59,6 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58 %endmacro -EPEL_TABLE 8,16, b, avx2 -EPEL_TABLE 10, 8, w, avx2 - EPEL_TABLE 8, 8, b, sse4 EPEL_TABLE 10, 4, w, sse4 EPEL_TABLE 12, 4, w, sse4 @@ -85,17 +82,20 @@ QPEL_TABLE 8, 8, b, sse4 QPEL_TABLE 10, 4, w, sse4 QPEL_TABLE 12, 4, w, sse4 +%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL QPEL_TABLE 8,16, b, avx2 QPEL_TABLE 10, 8, w, avx2 +EPEL_TABLE 8,16, b, avx2 +EPEL_TABLE 10, 8, w, avx2 +%endif + %define MAX_PB_SIZE 64 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10 %define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10 -%if ARCH_X86_64 - %macro SIMPLE_BILOAD 4 ;width, tab, r1, r2 %if %1 <= 4 movq %3, [%2] ; load data from source2 @@ -139,7 +139,7 @@ QPEL_TABLE 10, 8, w, avx2 %macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp %if cpuflag(avx2) %assign %%offset 32 -%ifdef PIC +%if ARCH_X86_32 lea %5q, [hevc_epel_filters_avx2_%1] %define FILTER %5q %else @@ -147,7 +147,7 @@ QPEL_TABLE 10, 8, w, avx2 %endif %else %assign %%offset 16 -%ifdef PIC +%if ARCH_X86_32 lea %5q, [hevc_epel_filters_sse4_%1] %define FILTER %5q %else @@ -759,9 +759,19 @@ cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 8, dst, dststride, src, srcstride, jnz .loop ; height loop RET +%if ARCH_X86_32 +cglobal hevc_put_hevc_bi_epel_h%1_%2, 4, 7, 8, dst, dststride, src, srcstride, src2, height, mx + mov r4d, mxm + EPEL_FILTER %2, r4, m4, m5, r5 + mov r4d, r4m + mov r5d, r5m +%else cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 8, dst, dststride, src, srcstride, src2, height, mx, rfilter - movdqa m6, [pw_bi_%2] + movifnidn mxd, mxm EPEL_FILTER %2, mx, m4, m5, rfilter +%endif +%assign %%stride ((%2 + 7)/8) + movdqa m6, [pw_bi_%2] .loop EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m4, m5, 1 @@ -811,11 +821,18 @@ cglobal 
hevc_put_hevc_uni_epel_v%1_%2, 5, 7, 8, dst, dststride, src, srcstride, RET +%if ARCH_X86_32 +cglobal hevc_put_hevc_bi_epel_v%1_%2, 5, 7, 8, dst, dststride, src, srcstride, src2, height, r3src, my + mov r5d, mym + EPEL_FILTER %2, r5, m4, m5, r3src + mov r5d, r5m +%else cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, 8, dst, dststride, src, srcstride, src2, height, r3src, my movifnidn myd, mym + EPEL_FILTER %2, my, m4, m5, r3src +%endif movdqa m6, [pw_bi_%2] sub srcq, srcstrideq - EPEL_FILTER %2, my, m4, m5, r3src lea r3srcq, [srcstrideq*3] .loop EPEL_LOAD %2, srcq, srcstride, %1 @@ -832,6 +849,7 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, 8, dst, dststride, src, srcstride, s %endmacro +%if ARCH_X86_64 ; ****************************** ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride, ; uint8_t *_src, ptrdiff_t _srcstride, @@ -1571,7 +1589,9 @@ WEIGHTING_FUNCS 2, 12 WEIGHTING_FUNCS 4, 12 WEIGHTING_FUNCS 6, 12 WEIGHTING_FUNCS 8, 12 +%endif +INIT_XMM sse4 ; adds ff_ and _sse4 to function name HEVC_PUT_HEVC_PEL_PIXELS 2, 8 HEVC_PUT_HEVC_PEL_PIXELS 4, 8 HEVC_PUT_HEVC_PEL_PIXELS 6, 8 @@ -1607,6 +1627,7 @@ HEVC_PUT_HEVC_EPEL 4, 12 HEVC_PUT_HEVC_EPEL 6, 12 HEVC_PUT_HEVC_EPEL 8, 12 +%if ARCH_X86_64 HEVC_PUT_HEVC_EPEL_HV 2, 8 HEVC_PUT_HEVC_EPEL_HV 4, 8 HEVC_PUT_HEVC_EPEL_HV 6, 8 diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h index ad8168f..4f83d7b 100644 --- a/libavcodec/x86/hevcdsp.h +++ b/libavcodec/x86/hevcdsp.h @@ -36,8 +36,10 @@ dst[idx1][idx2][idx3] = ff_hevc_put_hevc_ ## name ## _ ## D ## _##opt; \ dst ## _bi[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_ ## name ## _ ## D ## _##opt; \ dst ## _uni[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_ ## name ## _ ## D ## _##opt; \ +if (ARCH_X86_64) { \ dst ## _uni_w[idx1][idx2][idx3] = ff_hevc_put_hevc_uni_w_ ## name ## _ ## D ## _##opt; \ -dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt +dst ## _bi_w[idx1][idx2][idx3] = ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt; \ 
+} #define PEL_PROTOTYPE(name, D, opt) \ diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index e493033..b0abd27 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -163,7 +163,7 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \ mc_rep_bi_func2(name, bitd, step1, step2, W, opt) -#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL +#if HAVE_SSE4_EXTERNAL #define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \ void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \ @@ -237,7 +237,7 @@ mc_rep_mix_8(name, width1, width2, width3, opt1, opt2); \ mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2); \ mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) -#if HAVE_AVX2_EXTERNAL +#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4); mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4); @@ -357,6 +357,8 @@ mc_rep_funcs(epel_v,12, 8, 32, sse4); mc_rep_funcs(epel_v,12, 8, 24, sse4); mc_rep_funcs(epel_v,12, 8, 16, sse4); mc_rep_funcs(epel_v,12, 4, 12, sse4); + +# if ARCH_X86_64 mc_rep_funcs(epel_hv, 8, 16, 64, sse4); mc_rep_funcs(epel_hv, 8, 16, 48, sse4); mc_rep_funcs(epel_hv, 8, 16, 32, sse4); @@ -618,7 +620,9 @@ mc_bi_w_func(epel_hv, 12, 6, sse4); mc_bi_w_funcs(qpel_h, 12, sse4); mc_bi_w_funcs(qpel_v, 12, sse4); mc_bi_w_funcs(qpel_hv, 12, sse4); -#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL +# endif // ~ARCH_X86_64 + +#endif //HAVE_SSE4_EXTERNAL #define SAO_BAND_FILTER_FUNCS(bitd, opt) \ void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \ @@ -731,17 +735,20 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) } SAO_EDGE_INIT(8, ssse3); } - if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { + if (EXTERNAL_SSE4(cpu_flags)) { EPEL_LINKS(c->put_hevc_epel, 0, 0, 
pel_pixels, 8, sse4); EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4); EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); + if (ARCH_X86_64) { + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); + } } if (EXTERNAL_AVX(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx; @@ -880,16 +887,19 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3; } - if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { + if (EXTERNAL_SSE4(cpu_flags)) { EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4); EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4); EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4); + if (ARCH_X86_64) { + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4); + } } if (EXTERNAL_AVX(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx; @@ -1087,12 
+1097,15 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4); EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4); EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4); + if (ARCH_X86_64) { + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4); + } } if (EXTERNAL_AVX(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx; -- 1.9.2.msysgit.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel