The mmxext implementation is never faster than the C version: it is
slower for small and medium widths and merely on par at width 1920,
so remove it.

rgb24toyv12_16_200_c:                 14812.6 ( 1.00x)
rgb24toyv12_16_200_mmxext:            17400.4 ( 0.85x)
rgb24toyv12_128_60_c:                 35616.9 ( 1.00x)
rgb24toyv12_128_60_mmxext:            39610.4 ( 0.90x)
rgb24toyv12_512_16_c:                 37209.4 ( 1.00x)
rgb24toyv12_512_16_mmxext:            41136.2 ( 0.90x)
rgb24toyv12_1920_4_c:                 34737.4 ( 1.00x)
rgb24toyv12_1920_4_mmxext:            34818.9 ( 1.00x)
rgb24toyv12_1920_4_negstride_c:       34855.2 ( 1.00x)
rgb24toyv12_1920_4_negstride_mmxext:  34773.7 ( 1.00x)
---
 libswscale/x86/rgb2rgb.c | 207 ---------------------------------------
 1 file changed, 207 deletions(-)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 4d6ba9ff21..e27aea7b83 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -1473,210 +1473,6 @@ static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidt :::"memory"); } -/** - * Height should be a multiple of 2 and width should be a multiple of 2. - * (If this is a problem for anyone then tell me, and I will fix it.) - * Chrominance data is only taken from every second line, - * others are ignored in the C version. - * FIXME: Write HQ version. - */ -#if HAVE_7REGS -static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, - int width, int height, - int lumStride, int chromStride, int srcStride, - int32_t *rgb2yuv) -{ -#define BGR2Y_IDX "16*4+16*32" -#define BGR2U_IDX "16*4+16*33" -#define BGR2V_IDX "16*4+16*34" - int y; - const x86_reg chromWidth= width>>1; - - if (height > 2) { - ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv); - src += 2*srcStride; - ydst += 2*lumStride; - udst += chromStride; - vdst += chromStride; - height -= 2; - } - - for (y = 0; y < height - 2; y += 2) { - for (int i = 0; i < 2; i++) { - __asm__ volatile( - "mov %2, %%"FF_REG_a"\n\t" - "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"FF_REG_d") \n\t" - "movd (%0, %%"FF_REG_d"), %%mm0 \n\t" - "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t" - "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" 
- "psrad $8, %%mm3 \n\t" - "packssdw %%mm1, %%mm0 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "packssdw %%mm2, %%mm0 \n\t" - "psraw $7, %%mm0 \n\t" - - "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t" - "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t" - "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm1 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "pmaddwd %%mm6, %%mm3 \n\t" - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" - "packssdw %%mm1, %%mm4 \n\t" - "packssdw %%mm3, %%mm2 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm2 \n\t" - "add $24, %%"FF_REG_d"\n\t" - "packssdw %%mm2, %%mm4 \n\t" - "psraw $7, %%mm4 \n\t" - - "packuswb %%mm4, %%mm0 \n\t" - "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" - - MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t" - "add $8, %%"FF_REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) - NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset) - : "%"FF_REG_a, "%"FF_REG_d - ); - ydst += lumStride; - src += srcStride; - } - src -= srcStride*2; - __asm__ volatile( - "mov %4, %%"FF_REG_a"\n\t" - "movq "MANGLE(ff_w1111)", %%mm5 \n\t" - "movq "BGR2U_IDX"(%5), %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" - "add %%"FF_REG_d", %%"FF_REG_d"\n\t" - ".p2align 4 \n\t" - "1: \n\t" - PREFETCH" 64(%0, %%"FF_REG_d") \n\t" - PREFETCH" 64(%1, %%"FF_REG_d") \n\t" - "movq (%0, %%"FF_REG_d"), %%mm0 \n\t" - "movq (%1, %%"FF_REG_d"), %%mm1 \n\t" - "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t" - "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm0 \n\t" - "psrlq 
$24, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "movq "BGR2V_IDX"(%5), %%mm1 \n\t" - "movq "BGR2V_IDX"(%5), %%mm3 \n\t" - - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm0 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "psrad $8, %%mm0 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" - "packssdw %%mm2, %%mm0 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm0 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 - "psraw $7, %%mm0 \n\t" - - "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t" - "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t" - "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t" - "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm4, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "psrlq $24, %%mm4 \n\t" - "psrlq $24, %%mm2 \n\t" - PAVGB" %%mm1, %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "movq "BGR2V_IDX"(%5), %%mm1 \n\t" - "movq "BGR2V_IDX"(%5), %%mm3 \n\t" - - "pmaddwd %%mm4, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm6, %%mm4 \n\t" - "pmaddwd %%mm6, %%mm2 \n\t" - "psrad $8, %%mm4 \n\t" - "psrad $8, %%mm1 \n\t" - "psrad $8, %%mm2 \n\t" - "psrad $8, %%mm3 \n\t" - "packssdw %%mm2, %%mm4 \n\t" - "packssdw %%mm3, %%mm1 \n\t" - "pmaddwd %%mm5, %%mm4 \n\t" - "pmaddwd %%mm5, %%mm1 \n\t" - "add $24, %%"FF_REG_d"\n\t" - "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 - "psraw $7, %%mm4 \n\t" - - "movq %%mm0, %%mm1 \n\t" - "punpckldq %%mm4, %%mm0 \n\t" - "punpckhdq %%mm4, %%mm1 \n\t" - "packsswb %%mm1, %%mm0 \n\t" - "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" - "movd %%mm0, (%2, %%"FF_REG_a") \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, (%3, %%"FF_REG_a") \n\t" - "add $4, %%"FF_REG_a" \n\t" - " js 1b \n\t" - : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" 
(udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) - NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset) - : "%"FF_REG_a, "%"FF_REG_d - ); - - udst += chromStride; - vdst += chromStride; - src += srcStride*2; - } - - __asm__ volatile(EMMS" \n\t" - SFENCE" \n\t" - :::"memory"); - - ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); -} -#endif /* HAVE_7REGS */ - static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2, uint8_t *dst1, uint8_t *dst2, int width, int height, @@ -2257,9 +2053,6 @@ static av_cold void rgb2rgb_init_mmxext(void) yuyvtoyuv422 = yuyvtoyuv422_mmxext; planar2x = planar2x_mmxext; -#if HAVE_7REGS - ff_rgb24toyv12 = rgb24toyv12_mmxext; -#endif /* HAVE_7REGS */ yuyvtoyuv420 = yuyvtoyuv420_mmxext; uyvytoyuv420 = uyvytoyuv420_mmxext; -- 2.30.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".