Optimized memset with MMI in the following functions:
1. ff_h264_add_pixels4_8_mmi.
2. ff_h264_idct_add_8_mmi.
3. ff_h264_idct8_add_8_mmi.

This optimization improves H.264 decoding performance by about 1.3% (tested on
Loongson 3A3000).
---
 libavcodec/mips/h264dsp_mmi.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
index ac6fa99..ac65a20 100644
--- a/libavcodec/mips/h264dsp_mmi.c
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -31,7 +31,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
 {
     double ftmp[9];
     DECLARE_VAR_LOW32;
-    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
@@ -59,12 +58,16 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         MMI_SWC1(%[ftmp2], %[dst1], 0x00)
         MMI_SWC1(%[ftmp3], %[dst2], 0x00)
         MMI_SWC1(%[ftmp4], %[dst3], 0x00)
+
+        /* memset(src, 0, 32); */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[src])            \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[src])            \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           RESTRICT_ASM_LOW32
-          RESTRICT_ASM_ALL64
           [ftmp8]"=&f"(ftmp[8])
         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
@@ -72,7 +75,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         : "memory"
     );
 
-    memset(src, 0, 32);
 }
 
 void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
@@ -80,7 +82,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
     double ftmp[12];
     uint64_t tmp[1];
     DECLARE_VAR_LOW32;
-    DECLARE_VAR_ALL64;
     DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
@@ -152,6 +153,11 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp2], %[dst], 0x00)
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
         MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+
+        /* memset(block, 0, 32) */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -159,7 +165,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
           RESTRICT_ASM_LOW32
-          RESTRICT_ASM_ALL64
           RESTRICT_ASM_ADDRT
           [tmp0]"=&r"(tmp[0])
         : [dst]"r"(dst),                    [block]"r"(block),
@@ -167,7 +172,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         : "memory"
     );
 
-    memset(block, 0, 32);
 }
 
 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
@@ -176,7 +180,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
     uint64_t tmp[7];
     mips_reg addr[1];
     DECLARE_VAR_LOW32;
-    DECLARE_VAR_ALL64;
     DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
@@ -617,6 +620,17 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         MMI_SWC1(%[ftmp6], %[addr0], 0x00)
         MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
         PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
+
+        /* memset(block, 0, 128) */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x20(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x30(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x40(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x50(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x60(%[block])          \n\t"
+        "gssqc1     %[ftmp0],   %[ftmp0],       0x70(%[block])          \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -630,7 +644,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
           [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
           [tmp6]"=&r"(tmp[6]),
           RESTRICT_ASM_LOW32
-          RESTRICT_ASM_ALL64
           RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0])
         : [dst]"r"(dst),                    [block]"r"(block),
@@ -638,7 +651,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         : "$29","memory"
     );
 
-    memset(block, 0, 128);
 }
 
 void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
-- 
2.1.0

Hi Michael, this patch speeds up H.264 decoding by about 1.3%, and the commit
message has been updated. I uploaded three patches this week. They all passed
the FATE tests and an MD5 check (comparing the md5sum of the decoded output
before and after the optimization). Could you please help review and merge them?
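
For reference, here is a minimal standalone sketch of the zero-fill pattern the
patch folds into the existing asm blocks. The helper name clear_32_bytes_mmi is
only illustrative and not part of the patch. It zeroes one FP register and writes
it back with two 128-bit gssqc1 stores, which is what replaces the trailing
memset(src, 0, 32) / memset(block, 0, 32); the idct8 case repeats the same store
eight times for 128 bytes. It only assembles on Loongson-3 with the MMI extension
enabled.

#include <stdint.h>

/* Illustrative only, not part of the patch.  Requires a Loongson-3 CPU
 * with the MMI/Loongson extensions (gssqc1 is a 128-bit store from an
 * FP register pair); on other targets keep the plain memset(). */
static inline void clear_32_bytes_mmi(int16_t *block)
{
    double ftmp0;

    __asm__ volatile (
        /* zero one FP register, then store it as both halves of a
         * quad word, twice: 2 x 16 bytes = 32 bytes cleared */
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
        "gssqc1     %[ftmp0],   %[ftmp0],       0x00(%[block])          \n\t"
        "gssqc1     %[ftmp0],   %[ftmp0],       0x10(%[block])          \n\t"
        : [ftmp0]"=&f"(ftmp0)
        : [block]"r"(block)
        : "memory"
    );
}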
