Michael Niedermayr gave me the hint, that gcc might address memory
operands through %esp, so modifying the stack pointer with push/pop
is not gueranteed to work under all conditions.

One way would be, to backup the register to the stack manually, without
modifiying %esp, but this might not be necessary.

#if defined(PIC) && !defined(ARCH_X86_64)
            "push %%"REG_a"                     \n\t"
            "movl %2, %%"REG_a"                 \n\t"
            "movq (%%"REG_a"), %%mm5            \n\t" // qmat[0]
            "pop %%"REG_a"                      \n\t"
#else
            "movq (%2), %%mm5                   \n\t" // qmat[0]
#endif
            "pxor %%mm6, %%mm6                  \n\t"
            "psubw (%3), %%mm6                  \n\t" // -bias[0]
            "mov $-128, %%"REG_a"               \n\t"

push/pop seem to be not necessary at all here, because REG_a gets
loaded with $-128 and isn't used before, therefore no need to back
it up. This can be reduced to (couldn't test this yet):

#if defined(PIC) && !defined(ARCH_X86_64)
            "movl %2, %%"REG_a"                 \n\t"
            "movq (%%"REG_a"), %%mm5            \n\t" // qmat[0]
#else
            "movq (%2), %%mm5                   \n\t" // qmat[0]
#endif
            "pxor %%mm6, %%mm6                  \n\t"
            "psubw (%3), %%mm6                  \n\t" // -bias[0]
            "mov $-128, %%"REG_a"               \n\t"

In my previous suggestion to fix the bug in the code, usage of REG_b 
and push/pop can be avoided this way (tested and working):

#if defined(PIC) && !defined(ARCH_X86_64)
            "addl %2, %%"REG_a"                 \n\t" 
            "movq (%%"REG_a"), %%mm5            \n\t" // qmat[i]
            "subl %2, %%"REG_a"                 \n\t" 
#else
            "movq (%2, %%"REG_a"), %%mm5        \n\t" // qmat[i]
#endif

Instead of loading %2 to reg_b and adding reg_b to reg_a to
refrence qmat[i], %2 can be added to reg_a directly and subtracted
again from it afterwards.

The complete diff to the upstream sources and to the previous debian
package is attached.

bye,

Tobias

diff -Nur orig/libavcodec/i386/mpegvideo_mmx_template.c ffmpeg-0.cvs20050626/libavcodec/i386/mpegvideo_mmx_template.c
--- orig/libavcodec/i386/mpegvideo_mmx_template.c	2005-07-17 10:39:07.000000000 +0200
+++ ffmpeg-0.cvs20050626/libavcodec/i386/mpegvideo_mmx_template.c	2005-07-17 12:14:50.000000000 +0200
@@ -96,10 +96,8 @@
             "pxor %%mm7, %%mm7			\n\t" // 0
             "pxor %%mm4, %%mm4			\n\t" // 0
 #if defined(PIC) && !defined(ARCH_X86_64)
-            "push %%"REG_a"			\n\t"
             "movl %2, %%"REG_a"			\n\t"
             "movq (%%"REG_a"), %%mm5		\n\t" // qmat[0]
-            "pop %%"REG_a"			\n\t"
 #else
             "movq (%2), %%mm5			\n\t" // qmat[0]
 #endif
@@ -169,12 +167,11 @@
             "movq (%3, %%"REG_a"), %%mm6	\n\t" // bias[0]
             "paddusw %%mm6, %%mm0		\n\t" // ABS(block[i]) + bias[0]
 #if defined(PIC) && !defined(ARCH_X86_64)
-            "push %%"REG_a"			\n\t"
-            "movl %2, %%"REG_a"			\n\t"
-            "movq (%%"REG_a", %%"REG_a"), %%mm5	\n\t" // qmat[i]
-            "pop %%"REG_a"			\n\t"
+            "addl %2, %%"REG_a" 	        \n\t" 
+            "movq (%%"REG_a"), %%mm5		\n\t" // qmat[i]
+            "subl %2, %%"REG_a" 	        \n\t" 
 #else
-            "movq (%2, %%"REG_a"), %%mm5		\n\t" // qmat[i]
+            "movq (%2, %%"REG_a"), %%mm5	\n\t" // qmat[i]
 #endif
             "pmulhw %%mm5, %%mm0		\n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
             "por %%mm0, %%mm4			\n\t" 
--- ffmpeg-0.cvs20050626.orig/libavcodec/i386/mpegvideo_mmx_template.c
+++ ffmpeg-0.cvs20050626/libavcodec/i386/mpegvideo_mmx_template.c
@@ -95,7 +95,12 @@
             SPREADW(%%mm3)
             "pxor %%mm7, %%mm7			\n\t" // 0
             "pxor %%mm4, %%mm4			\n\t" // 0
+#if defined(PIC) && !defined(ARCH_X86_64)
+            "movl %2, %%"REG_a"			\n\t"
+            "movq (%%"REG_a"), %%mm5		\n\t" // qmat[0]
+#else
             "movq (%2), %%mm5			\n\t" // qmat[0]
+#endif
             "pxor %%mm6, %%mm6			\n\t"
             "psubw (%3), %%mm6			\n\t" // -bias[0]
             "mov $-128, %%"REG_a"		\n\t"
@@ -128,7 +133,11 @@
             "movd %%mm3, %%"REG_a"		\n\t"
             "movzb %%al, %%"REG_a"		\n\t" // last_non_zero_p1
 	    : "+a" (last_non_zero_p1)
+#if defined(PIC) && !defined(ARCH_X86_64)
+            : "r" (block+64), "m" (qmat), "r" (bias),
+#else
             : "r" (block+64), "r" (qmat), "r" (bias),
+#endif
               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
         );
         // note the asm is split cuz gcc doesnt like that many operands ...
@@ -157,7 +166,13 @@
             "psubw %%mm1, %%mm0			\n\t" // ABS(block[i])
             "movq (%3, %%"REG_a"), %%mm6	\n\t" // bias[0]
             "paddusw %%mm6, %%mm0		\n\t" // ABS(block[i]) + bias[0]
-            "movq (%2, %%"REG_a"), %%mm5		\n\t" // qmat[i]
+#if defined(PIC) && !defined(ARCH_X86_64)
+            "addl %2, %%"REG_a" 	        \n\t" 
+            "movq (%%"REG_a"), %%mm5		\n\t" // qmat[i]
+            "subl %2, %%"REG_a" 	        \n\t" 
+#else
+            "movq (%2, %%"REG_a"), %%mm5	\n\t" // qmat[i]
+#endif
             "pmulhw %%mm5, %%mm0		\n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
             "por %%mm0, %%mm4			\n\t" 
             "pxor %%mm1, %%mm0			\n\t" 
@@ -179,7 +194,11 @@
             "movd %%mm3, %%"REG_a"		\n\t"
             "movzb %%al, %%"REG_a"		\n\t" // last_non_zero_p1
 	    : "+a" (last_non_zero_p1)
+#if defined(PIC) && !defined(ARCH_X86_64)
+            : "r" (block+64), "m" (qmat+64), "r" (bias+64),
+#else
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+#endif
               "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
         );
         // note the asm is split cuz gcc doesnt like that many operands ...

Reply via email to