> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-boun...@ffmpeg.org> On Behalf Of Frank Plowman
> Sent: 2024年11月29日 6:21
> To: ffmpeg-devel@ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH v2] lavc/x86/videodsp: Drop MMX usage
>
> Ping
Pushed, thanks. > > On 16/11/2024 17:59, Frank Plowman wrote: > > Remove the MMX versions of these functions and modify the SSE > > implementations to avoid using MMX registers. > > > > Signed-off-by: Frank Plowman <p...@frankplowman.com> > > --- > > libavcodec/x86/videodsp.asm | 61 ++++++----------------- > > libavcodec/x86/videodsp_init.c | 88 +++++++++++++++++----------------- > > 2 files changed, 59 insertions(+), 90 deletions(-) > > > > diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm > > index 3cc07878d3..81ae2ec10c 100644 > > --- a/libavcodec/x86/videodsp.asm > > +++ b/libavcodec/x86/videodsp.asm > > @@ -123,54 +123,43 @@ hvar_fn > > ; - if (%2 & 8) fills 8 bytes into xmm$next > > ; - if (%2 & 4) fills 4 bytes into xmm$next > > ; - if (%2 & 3) fills 1, 2 or 4 bytes in eax > > -; on mmx, - fills mm0-7 for consecutive sets of 8 pixels > > -; - if (%2 & 4) fills 4 bytes into mm$next > > -; - if (%2 & 3) fills 1, 2 or 4 bytes in eax > > ; writing data out is in the same way > > %macro READ_NUM_BYTES 2 > > %assign %%off 0 ; offset in source buffer > > -%assign %%mmx_idx 0 ; mmx register index > > %assign %%xmm_idx 0 ; xmm register index > > > > %rep %2/mmsize > > -%if mmsize == 16 > > movu xmm %+ %%xmm_idx, [srcq+%%off] > > %assign %%xmm_idx %%xmm_idx+1 > > -%else ; mmx > > - movu mm %+ %%mmx_idx, [srcq+%%off] > > -%assign %%mmx_idx %%mmx_idx+1 > > -%endif > > %assign %%off %%off+mmsize > > %endrep ; %2/mmsize > > > > -%if mmsize == 16 > > %if (%2-%%off) >= 8 > > %if %2 > 16 && (%2-%%off) > 8 > > movu xmm %+ %%xmm_idx, [srcq+%2-16] > > %assign %%xmm_idx %%xmm_idx+1 > > %assign %%off %2 > > %else > > - movq mm %+ %%mmx_idx, [srcq+%%off] > > -%assign %%mmx_idx %%mmx_idx+1 > > + movq xmm %+ %%xmm_idx, [srcq+%%off] > > +%assign %%xmm_idx %%xmm_idx+1 > > %assign %%off %%off+8 > > %endif > > %endif ; (%2-%%off) >= 8 > > -%endif > > > > %if (%2-%%off) >= 4 > > %if %2 > 8 && (%2-%%off) > 4 > > - movq mm %+ %%mmx_idx, [srcq+%2-8] > > + movq xmm %+ 
%%xmm_idx, [srcq+%2-8] > > %assign %%off %2 > > %else > > - movd mm %+ %%mmx_idx, [srcq+%%off] > > + movd xmm %+ %%xmm_idx, [srcq+%%off] > > %assign %%off %%off+4 > > %endif > > -%assign %%mmx_idx %%mmx_idx+1 > > +%assign %%xmm_idx %%xmm_idx+1 > > %endif ; (%2-%%off) >= 4 > > > > %if (%2-%%off) >= 1 > > %if %2 >= 4 > > - movd mm %+ %%mmx_idx, [srcq+%2-4] > > + movd xmm %+ %%xmm_idx, [srcq+%2-4] > > %elif (%2-%%off) == 1 > > mov valb, [srcq+%2-1] > > %elif (%2-%%off) == 2 > > @@ -185,48 +174,40 @@ hvar_fn > > > > %macro WRITE_NUM_BYTES 2 > > %assign %%off 0 ; offset in destination buffer > > -%assign %%mmx_idx 0 ; mmx register index > > %assign %%xmm_idx 0 ; xmm register index > > > > %rep %2/mmsize > > -%if mmsize == 16 > > movu [dstq+%%off], xmm %+ %%xmm_idx > > %assign %%xmm_idx %%xmm_idx+1 > > -%else ; mmx > > - movu [dstq+%%off], mm %+ %%mmx_idx > > -%assign %%mmx_idx %%mmx_idx+1 > > -%endif > > %assign %%off %%off+mmsize > > %endrep ; %2/mmsize > > > > -%if mmsize == 16 > > %if (%2-%%off) >= 8 > > %if %2 > 16 && (%2-%%off) > 8 > > movu [dstq+%2-16], xmm %+ %%xmm_idx > > %assign %%xmm_idx %%xmm_idx+1 > > %assign %%off %2 > > %else > > - movq [dstq+%%off], mm %+ %%mmx_idx > > -%assign %%mmx_idx %%mmx_idx+1 > > + movq [dstq+%%off], xmm %+ %%xmm_idx > > +%assign %%xmm_idx %%xmm_idx+1 > > %assign %%off %%off+8 > > %endif > > %endif ; (%2-%%off) >= 8 > > -%endif > > > > %if (%2-%%off) >= 4 > > %if %2 > 8 && (%2-%%off) > 4 > > - movq [dstq+%2-8], mm %+ %%mmx_idx > > + movq [dstq+%2-8], xmm %+ %%xmm_idx > > %assign %%off %2 > > %else > > - movd [dstq+%%off], mm %+ %%mmx_idx > > + movd [dstq+%%off], xmm %+ %%xmm_idx > > %assign %%off %%off+4 > > %endif > > -%assign %%mmx_idx %%mmx_idx+1 > > +%assign %%xmm_idx %%xmm_idx+1 > > %endif ; (%2-%%off) >= 4 > > > > %if (%2-%%off) >= 1 > > %if %2 >= 4 > > - movd [dstq+%2-4], mm %+ %%mmx_idx > > + movd [dstq+%2-4], xmm %+ %%xmm_idx > > %elif (%2-%%off) == 1 > > mov [dstq+%2-1], valb > > %elif (%2-%%off) == 2 > > @@ -318,11 
+299,8 @@ cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, > > start_y, end_y, bh > > %endrep ; 1+%2-%1 > > %endmacro ; VERTICAL_EXTEND > > > > -INIT_MMX mmx > > -VERTICAL_EXTEND 1, 15 > > - > > -INIT_XMM sse > > -VERTICAL_EXTEND 16, 22 > > +INIT_XMM sse2 > > +VERTICAL_EXTEND 1, 22 > > > > ; left/right (horizontal) fast extend functions > > ; these are essentially identical to the vertical extend ones above, > > @@ -337,11 +315,7 @@ VERTICAL_EXTEND 16, 22 > > imul vald, 0x01010101 > > %if %1 >= 8 > > movd m0, vald > > -%if mmsize == 16 > > pshufd m0, m0, q0000 > > -%else > > - punpckldq m0, m0 > > -%endif ; mmsize == 16 > > %endif ; %1 > 16 > > %endif ; avx2 > > %endmacro ; READ_V_PIXEL > > @@ -356,7 +330,6 @@ VERTICAL_EXTEND 16, 22 > > %assign %%off %%off+mmsize > > %endrep ; %1/mmsize > > > > -%if mmsize == 16 > > %if %1-%%off >= 8 > > %if %1 > 16 && %1-%%off > 8 > > movu [%2+%1-16], m0 > > @@ -366,7 +339,6 @@ VERTICAL_EXTEND 16, 22 > > %assign %%off %%off+8 > > %endif > > %endif ; %1-%%off >= 8 > > -%endif ; mmsize == 16 > > > > %if %1-%%off >= 4 > > %if %1 > 8 && %1-%%off > 4 > > @@ -415,11 +387,8 @@ cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, > > dst_stride, start_x, bh, val > > %endrep ; 1+(%2-%1)/2 > > %endmacro ; H_EXTEND > > > > -INIT_MMX mmx > > -H_EXTEND 2, 14 > > - > > INIT_XMM sse2 > > -H_EXTEND 16, 22 > > +H_EXTEND 2, 22 > > > > %if HAVE_AVX2_EXTERNAL > > INIT_XMM avx2 > > diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c > > index ae9db95624..602856de1e 100644 > > --- a/libavcodec/x86/videodsp_init.c > > +++ b/libavcodec/x86/videodsp_init.c > > @@ -37,37 +37,37 @@ typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg > > dst_stride, > > x86_reg start_y, x86_reg end_y, x86_reg bh, > > x86_reg w); > > > > -extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx; > > -extern 
emu_edge_vfix_func ff_emu_edge_vfix5_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx; > > -extern emu_edge_vfix_func ff_emu_edge_vfix16_sse; > > -extern emu_edge_vfix_func ff_emu_edge_vfix17_sse; > > -extern emu_edge_vfix_func ff_emu_edge_vfix18_sse; > > -extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; > > -extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; > > -extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; > > -extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; > > -static emu_edge_vfix_func * const vfixtbl_sse[22] = { > > - ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, > > - ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, > > - ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, > > - ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx, > > - ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx, > > - ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse, > > - ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse, > > - ff_emu_edge_vfix22_sse > > +extern emu_edge_vfix_func ff_emu_edge_vfix1_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix2_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix3_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix4_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix5_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix6_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix7_sse2; > > +extern 
emu_edge_vfix_func ff_emu_edge_vfix8_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix9_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix10_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix11_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix12_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix13_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix14_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix15_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix16_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix17_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix18_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix19_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix20_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix21_sse2; > > +extern emu_edge_vfix_func ff_emu_edge_vfix22_sse2; > > +static emu_edge_vfix_func * const vfixtbl_sse2[22] = { > > + ff_emu_edge_vfix1_sse2, ff_emu_edge_vfix2_sse2, > > ff_emu_edge_vfix3_sse2, > > + ff_emu_edge_vfix4_sse2, ff_emu_edge_vfix5_sse2, > > ff_emu_edge_vfix6_sse2, > > + ff_emu_edge_vfix7_sse2, ff_emu_edge_vfix8_sse2, > > ff_emu_edge_vfix9_sse2, > > + ff_emu_edge_vfix10_sse2, ff_emu_edge_vfix11_sse2, > > ff_emu_edge_vfix12_sse2, > > + ff_emu_edge_vfix13_sse2, ff_emu_edge_vfix14_sse2, > > ff_emu_edge_vfix15_sse2, > > + ff_emu_edge_vfix16_sse2, ff_emu_edge_vfix17_sse2, > > ff_emu_edge_vfix18_sse2, > > + ff_emu_edge_vfix19_sse2, ff_emu_edge_vfix20_sse2, > > ff_emu_edge_vfix21_sse2, > > + ff_emu_edge_vfix22_sse2 > > }; > > extern emu_edge_vvar_func ff_emu_edge_vvar_sse; > > > > @@ -76,21 +76,21 @@ typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg > > dst_stride, > > typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, > > x86_reg start_x, x86_reg n_words, x86_reg > > bh); > > > > -extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx; > > -extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx; > > -extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx; > > -extern emu_edge_hfix_func 
ff_emu_edge_hfix8_mmx; > > -extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx; > > -extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx; > > -extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx; > > +extern emu_edge_hfix_func ff_emu_edge_hfix2_sse2; > > +extern emu_edge_hfix_func ff_emu_edge_hfix4_sse2; > > +extern emu_edge_hfix_func ff_emu_edge_hfix6_sse2; > > +extern emu_edge_hfix_func ff_emu_edge_hfix8_sse2; > > +extern emu_edge_hfix_func ff_emu_edge_hfix10_sse2; > > +extern emu_edge_hfix_func ff_emu_edge_hfix12_sse2; > > +extern emu_edge_hfix_func ff_emu_edge_hfix14_sse2; > > extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; > > extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; > > extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; > > extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; > > static emu_edge_hfix_func * const hfixtbl_sse2[11] = { > > - ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, > > - ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, > > - ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, > > ff_emu_edge_hfix18_sse2, > > + ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2, > > ff_emu_edge_hfix6_sse2, > > + ff_emu_edge_hfix8_sse2, ff_emu_edge_hfix10_sse2, > > ff_emu_edge_hfix12_sse2, > > + ff_emu_edge_hfix14_sse2, ff_emu_edge_hfix16_sse2, > > ff_emu_edge_hfix18_sse2, > > ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 > > }; > > extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; > > @@ -104,7 +104,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2; > > extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2; > > extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2; > > static emu_edge_hfix_func * const hfixtbl_avx2[11] = { > > - ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, > > + ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2, > > ff_emu_edge_hfix6_sse2, > > ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, > > ff_emu_edge_hfix12_avx2, > > ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, > > 
ff_emu_edge_hfix18_avx2, > > ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2 > > @@ -196,7 +196,7 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t > > *buf, const uint8_t *src, > > int h) > > { > > emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, > > - src_x, src_y, w, h, vfixtbl_sse, > > &ff_emu_edge_vvar_sse, > > + src_x, src_y, w, h, vfixtbl_sse2, > > &ff_emu_edge_vvar_sse, > > hfixtbl_sse2, &ff_emu_edge_hvar_sse2); > > } > > > > @@ -209,7 +209,7 @@ static av_noinline void emulated_edge_mc_avx2(uint8_t > > *buf, const uint8_t *src, > > int h) > > { > > emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, > > - src_x, src_y, w, h, vfixtbl_sse, > > &ff_emu_edge_vvar_sse, > > + src_x, src_y, w, h, vfixtbl_sse2, > > &ff_emu_edge_vvar_sse, > > hfixtbl_avx2, &ff_emu_edge_hvar_avx2); > > } > > #endif /* HAVE_AVX2_EXTERNAL */ > > -- > Frank > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".