On 15/11/2024 20:55, Andreas Rheinhardt wrote:
> Frank Plowman:
>> Remove the MMX versions of these functions and modify the SSE
>> implementations to avoid using MMX registers.
>>
>> Signed-off-by: Frank Plowman <p...@frankplowman.com>
>> ---
>> This wasn't wholly straightforward as the existing SSE implementation did
>> not only use SSE but rather a blend of SSE and MMX. I would appreciate
>> some review, as I am not particularly familiar with x86 assembly. Of
>> particular interest are the changes to {READ,WRITE}_NUM_BYTES. The new
>> implementation does not make economic use of the XMM registers like the
>> old one did, but it still does not exhaust them -- was this more of a
>> concern when this was originally written?
>> ---
>>  libavcodec/x86/videodsp.asm    | 63 ++++++----------------
>>  libavcodec/x86/videodsp_init.c | 99 +++++++++++++++++-----------------
>>  2 files changed, 65 insertions(+), 97 deletions(-)
>>
>> diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
>> index 3cc07878d3..20dcd3e2b9 100644
>> --- a/libavcodec/x86/videodsp.asm
>> +++ b/libavcodec/x86/videodsp.asm
>> @@ -123,54 +123,43 @@ hvar_fn
>>  ; - if (%2 & 8) fills 8 bytes into xmm$next
>>  ; - if (%2 & 4) fills 4 bytes into xmm$next
>>  ; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
>> -; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
>> -; - if (%2 & 4) fills 4 bytes into mm$next
>> -; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
>>  ; writing data out is in the same way
>>  %macro READ_NUM_BYTES 2
>>  %assign %%off 0 ; offset in source buffer
>> -%assign %%mmx_idx 0 ; mmx register index
>>  %assign %%xmm_idx 0 ; xmm register index
>>
>>  %rep %2/mmsize
>> -%if mmsize == 16
>>      movu xmm %+ %%xmm_idx, [srcq+%%off]
>>  %assign %%xmm_idx %%xmm_idx+1
>> -%else ; mmx
>> -    movu mm %+ %%mmx_idx, [srcq+%%off]
>> -%assign %%mmx_idx %%mmx_idx+1
>> -%endif
>>  %assign %%off %%off+mmsize
>>  %endrep ; %2/mmsize
>>
>> -%if mmsize == 16
>>  %if (%2-%%off) >= 8
>>  %if %2 > 16 && (%2-%%off) > 8
>>      movu xmm %+ %%xmm_idx, [srcq+%2-16]
>>  %assign %%xmm_idx %%xmm_idx+1
>>  %assign %%off %2
>>  %else
>> -    movq mm %+ %%mmx_idx, [srcq+%%off]
>> -%assign %%mmx_idx %%mmx_idx+1
>> +    movq xmm %+ %%xmm_idx, [srcq+%%off]
>> +%assign %%xmm_idx %%xmm_idx+1
>>  %assign %%off %%off+8
>>  %endif
>>  %endif ; (%2-%%off) >= 8
>> -%endif
>>
>>  %if (%2-%%off) >= 4
>>  %if %2 > 8 && (%2-%%off) > 4
>> -    movq mm %+ %%mmx_idx, [srcq+%2-8]
>> +    movq xmm %+ %%xmm_idx, [srcq+%2-8]
>>  %assign %%off %2
>>  %else
>> -    movd mm %+ %%mmx_idx, [srcq+%%off]
>> +    movd xmm %+ %%xmm_idx, [srcq+%%off]
>>  %assign %%off %%off+4
>>  %endif
>> -%assign %%mmx_idx %%mmx_idx+1
>> +%assign %%xmm_idx %%xmm_idx+1
>>  %endif ; (%2-%%off) >= 4
>>
>>  %if (%2-%%off) >= 1
>>  %if %2 >= 4
>> -    movd mm %+ %%mmx_idx, [srcq+%2-4]
>> +    movd xmm %+ %%xmm_idx, [srcq+%2-4]
>>  %elif (%2-%%off) == 1
>>      mov valb, [srcq+%2-1]
>>  %elif (%2-%%off) == 2
>> @@ -185,48 +174,40 @@ hvar_fn
>>
>>  %macro WRITE_NUM_BYTES 2
>>  %assign %%off 0 ; offset in destination buffer
>> -%assign %%mmx_idx 0 ; mmx register index
>>  %assign %%xmm_idx 0 ; xmm register index
>>
>>  %rep %2/mmsize
>> -%if mmsize == 16
>>      movu [dstq+%%off], xmm %+ %%xmm_idx
>>  %assign %%xmm_idx %%xmm_idx+1
>> -%else ; mmx
>> -    movu [dstq+%%off], mm %+ %%mmx_idx
>> -%assign %%mmx_idx %%mmx_idx+1
>> -%endif
>>  %assign %%off %%off+mmsize
>>  %endrep ; %2/mmsize
>>
>> -%if mmsize == 16
>>  %if (%2-%%off) >= 8
>>  %if %2 > 16 && (%2-%%off) > 8
>>      movu [dstq+%2-16], xmm %+ %%xmm_idx
>>  %assign %%xmm_idx %%xmm_idx+1
>>  %assign %%off %2
>>  %else
>> -    movq [dstq+%%off], mm %+ %%mmx_idx
>> -%assign %%mmx_idx %%mmx_idx+1
>> +    movq [dstq+%%off], xmm %+ %%xmm_idx
>> +%assign %%xmm_idx %%xmm_idx+1
>>  %assign %%off %%off+8
>>  %endif
>>  %endif ; (%2-%%off) >= 8
>> -%endif
>>
>>  %if (%2-%%off) >= 4
>>  %if %2 > 8 && (%2-%%off) > 4
>> -    movq [dstq+%2-8], mm %+ %%mmx_idx
>> +    movq [dstq+%2-8], xmm %+ %%xmm_idx
>>  %assign %%off %2
>>  %else
>> -    movd [dstq+%%off], mm %+ %%mmx_idx
>> +    movd [dstq+%%off], xmm %+ %%xmm_idx
>>  %assign %%off %%off+4
>>  %endif
>> -%assign %%mmx_idx %%mmx_idx+1
>> +%assign %%xmm_idx %%xmm_idx+1
>>  %endif ; (%2-%%off) >= 4
>>
>>  %if (%2-%%off) >= 1
>>  %if %2 >= 4
>> -    movd [dstq+%2-4], mm %+ %%mmx_idx
>> +    movd [dstq+%2-4], xmm %+ %%xmm_idx
>>  %elif (%2-%%off) == 1
>>      mov [dstq+%2-1], valb
>>  %elif (%2-%%off) == 2
>> @@ -318,11 +299,8 @@ cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src,
>> start_y, end_y, bh
>>  %endrep ; 1+%2-%1
>>  %endmacro ; VERTICAL_EXTEND
>>
>> -INIT_MMX mmx
>> -VERTICAL_EXTEND 1, 15
>> -
>> -INIT_XMM sse
>> -VERTICAL_EXTEND 16, 22
>> +INIT_XMM sse2
>> +VERTICAL_EXTEND 1, 22
>>
>>  ; left/right (horizontal) fast extend functions
>>  ; these are essentially identical to the vertical extend ones above,
>> @@ -337,11 +315,7 @@ VERTICAL_EXTEND 16, 22
>>      imul vald, 0x01010101
>>  %if %1 >= 8
>>      movd m0, vald
>> -%if mmsize == 16
>>      pshufd m0, m0, q0000
>> -%else
>> -    punpckldq m0, m0
>> -%endif ; mmsize == 16
>>  %endif ; %1 > 16
>>  %endif ; avx2
>>  %endmacro ; READ_V_PIXEL
>> @@ -356,7 +330,6 @@ VERTICAL_EXTEND 16, 22
>>  %assign %%off %%off+mmsize
>>  %endrep ; %1/mmsize
>>
>> -%if mmsize == 16
>>  %if %1-%%off >= 8
>>  %if %1 > 16 && %1-%%off > 8
>>      movu [%2+%1-16], m0
>> @@ -366,7 +339,6 @@ VERTICAL_EXTEND 16, 22
>>  %assign %%off %%off+8
>>  %endif
>>  %endif ; %1-%%off >= 8
>> -%endif ; mmsize == 16
>>
>>  %if %1-%%off >= 4
>>  %if %1 > 8 && %1-%%off > 4
>> @@ -415,18 +387,15 @@ cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst,
>> dst_stride, start_x, bh, val
>>  %endrep ; 1+(%2-%1)/2
>>  %endmacro ; H_EXTEND
>>
>> -INIT_MMX mmx
>> -H_EXTEND 2, 14
>> -
>>  INIT_XMM sse2
>> -H_EXTEND 16, 22
>> +H_EXTEND 2, 22
>>
>>  %if HAVE_AVX2_EXTERNAL
>>  INIT_XMM avx2
>>  H_EXTEND 8, 22
>>  %endif
>>
>> -INIT_MMX mmxext
>> +INIT_XMM sse2
>>  cglobal prefetch, 3, 3, 0, buf, stride, h
>>  .loop:
>>      prefetcht0 [bufq]
>> diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
>> index ae9db95624..376eec9c23 100644
>> --- a/libavcodec/x86/videodsp_init.c
>> +++ b/libavcodec/x86/videodsp_init.c
>> @@ -37,37 +37,37 @@ typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg
>> dst_stride,
>>                                  x86_reg start_y, x86_reg end_y, x86_reg bh,
>>                                  x86_reg w);
>>
>> -extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
>> -extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
>> -static emu_edge_vfix_func * const vfixtbl_sse[22] = {
>> -    ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
>> -    ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
>> -    ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
>> -    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
>> -    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
>> -    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
>> -    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
>> -    ff_emu_edge_vfix22_sse
>> +extern emu_edge_vfix_func ff_emu_edge_vfix1_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix2_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix3_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix4_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix5_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix6_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix7_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix8_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix9_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix10_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix11_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix12_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix13_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix14_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix15_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix16_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix17_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix18_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix19_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix20_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix21_sse2;
>> +extern emu_edge_vfix_func ff_emu_edge_vfix22_sse2;
>> +static emu_edge_vfix_func * const vfixtbl_sse2[22] = {
>> +    ff_emu_edge_vfix1_sse2, ff_emu_edge_vfix2_sse2,
>> ff_emu_edge_vfix3_sse2,
>> +    ff_emu_edge_vfix4_sse2, ff_emu_edge_vfix5_sse2,
>> ff_emu_edge_vfix6_sse2,
>> +    ff_emu_edge_vfix7_sse2, ff_emu_edge_vfix8_sse2,
>> ff_emu_edge_vfix9_sse2,
>> +    ff_emu_edge_vfix10_sse2, ff_emu_edge_vfix11_sse2,
>> ff_emu_edge_vfix12_sse2,
>> +    ff_emu_edge_vfix13_sse2, ff_emu_edge_vfix14_sse2,
>> ff_emu_edge_vfix15_sse2,
>> +    ff_emu_edge_vfix16_sse2, ff_emu_edge_vfix17_sse2,
>> ff_emu_edge_vfix18_sse2,
>> +    ff_emu_edge_vfix19_sse2, ff_emu_edge_vfix20_sse2,
>> ff_emu_edge_vfix21_sse2,
>> +    ff_emu_edge_vfix22_sse2
>>  };
>>  extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
>>
>> @@ -76,21 +76,21 @@ typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg
>> dst_stride,
>>  typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
>>                                  x86_reg start_x, x86_reg n_words, x86_reg
>> bh);
>>
>> -extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
>> -extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
>> -extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
>> -extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
>> -extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
>> -extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
>> -extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
>> +extern emu_edge_hfix_func ff_emu_edge_hfix2_sse2;
>> +extern emu_edge_hfix_func ff_emu_edge_hfix4_sse2;
>> +extern emu_edge_hfix_func ff_emu_edge_hfix6_sse2;
>> +extern emu_edge_hfix_func ff_emu_edge_hfix8_sse2;
>> +extern emu_edge_hfix_func ff_emu_edge_hfix10_sse2;
>> +extern emu_edge_hfix_func ff_emu_edge_hfix12_sse2;
>> +extern emu_edge_hfix_func ff_emu_edge_hfix14_sse2;
>>  extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
>>  extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
>>  extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
>>  extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
>>  static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
>> -    ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
>> -    ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
>> -    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2,
>> ff_emu_edge_hfix18_sse2,
>> +    ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2,
>> ff_emu_edge_hfix6_sse2,
>> +    ff_emu_edge_hfix8_sse2, ff_emu_edge_hfix10_sse2,
>> ff_emu_edge_hfix12_sse2,
>> +    ff_emu_edge_hfix14_sse2, ff_emu_edge_hfix16_sse2,
>> ff_emu_edge_hfix18_sse2,
>>      ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
>>  };
>>  extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
>> @@ -104,7 +104,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
>>  extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
>>  extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
>>  static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
>> -    ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
>> +    ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2,
>> ff_emu_edge_hfix6_sse2,
>>      ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2,
>> ff_emu_edge_hfix12_avx2,
>>      ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2,
>> ff_emu_edge_hfix18_avx2,
>>      ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
>> @@ -196,7 +196,7 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t
>> *buf, const uint8_t *src,
>>                                                int h)
>>  {
>>      emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
>> -                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
>> +                     src_x, src_y, w, h, vfixtbl_sse2,
>> &ff_emu_edge_vvar_sse,
>>                       hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
>>  }
>>
>> @@ -209,24 +209,23 @@ static av_noinline void emulated_edge_mc_avx2(uint8_t
>> *buf, const uint8_t *src,
>>                                                int h)
>>  {
>>      emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
>> -                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
>> +                     src_x, src_y, w, h, vfixtbl_sse2,
>> &ff_emu_edge_vvar_sse,
>>                       hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
>>  }
>>  #endif /* HAVE_AVX2_EXTERNAL */
>>  #endif /* HAVE_X86ASM */
>>
>> -void ff_prefetch_mmxext(const uint8_t *buf, ptrdiff_t stride, int h);
>> +void ff_prefetch_sse2(const uint8_t *buf, ptrdiff_t stride, int h);
>>
>>  av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
>>  {
>>  #if HAVE_X86ASM
>>      int cpu_flags = av_get_cpu_flags();
>>
>> -    if (EXTERNAL_MMXEXT(cpu_flags)) {
>> -        ctx->prefetch = ff_prefetch_mmxext;
>> -    }
>> -    if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
>> -        ctx->emulated_edge_mc = emulated_edge_mc_sse2;
>> +    if (EXTERNAL_SSE2(cpu_flags)) {
>> +        ctx->prefetch = ff_prefetch_sse2;
>> +        if (bpc <= 8)
>> +            ctx->emulated_edge_mc = emulated_edge_mc_sse2;
>>      }
>>  #if HAVE_AVX2_EXTERNAL
>>      if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
>
> Why did you change prefetch to declare an SSE2 requirement?
>
> - Andreas

No good reason. I've reverted the changes to prefetch in v2.

--
Frank