Michael Niedermayr gave me the hint, that gcc might address memory operands through %esp, so modifying the stack pointer with push/pop is not gueranteed to work under all conditions.
One way would be, to backup the register to the stack manually, without modifiying %esp, but this might not be necessary. #if defined(PIC) && !defined(ARCH_X86_64) "push %%"REG_a" \n\t" "movl %2, %%"REG_a" \n\t" "movq (%%"REG_a"), %%mm5 \n\t" // qmat[0] "pop %%"REG_a" \n\t" #else "movq (%2), %%mm5 \n\t" // qmat[0] #endif "pxor %%mm6, %%mm6 \n\t" "psubw (%3), %%mm6 \n\t" // -bias[0] "mov $-128, %%"REG_a" \n\t" push/pop seem to be not necessary at all here, because REG_a gets loaded with $-128 and isn't used before, therefore no need to back it up. This can be reduced to (couldn't test this yet): #if defined(PIC) && !defined(ARCH_X86_64) "movl %2, %%"REG_a" \n\t" "movq (%%"REG_a"), %%mm5 \n\t" // qmat[0] #else "movq (%2), %%mm5 \n\t" // qmat[0] #endif "pxor %%mm6, %%mm6 \n\t" "psubw (%3), %%mm6 \n\t" // -bias[0] "mov $-128, %%"REG_a" \n\t" In my previous suggestion to fix the bug in the code, usage of REG_b and push/pop can be avoided this way (tested and working): #if defined(PIC) && !defined(ARCH_X86_64) "addl %2, %%"REG_a" \n\t" "movq (%%"REG_a"), %%mm5 \n\t" // qmat[i] "subl %2, %%"REG_a" \n\t" #else "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] #endif Instead of loading %2 to reg_b and adding reg_b to reg_a to refrence qmat[i], %2 can be added to reg_a directly and subtracted again from it afterwards. The complete diff to the upstream sources and to the previous debian package is attached. bye, Tobias
diff -Nur orig/libavcodec/i386/mpegvideo_mmx_template.c ffmpeg-0.cvs20050626/libavcodec/i386/mpegvideo_mmx_template.c --- orig/libavcodec/i386/mpegvideo_mmx_template.c 2005-07-17 10:39:07.000000000 +0200 +++ ffmpeg-0.cvs20050626/libavcodec/i386/mpegvideo_mmx_template.c 2005-07-17 12:14:50.000000000 +0200 @@ -96,10 +96,8 @@ "pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0 #if defined(PIC) && !defined(ARCH_X86_64) - "push %%"REG_a" \n\t" "movl %2, %%"REG_a" \n\t" "movq (%%"REG_a"), %%mm5 \n\t" // qmat[0] - "pop %%"REG_a" \n\t" #else "movq (%2), %%mm5 \n\t" // qmat[0] #endif @@ -169,12 +167,11 @@ "movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0] "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] #if defined(PIC) && !defined(ARCH_X86_64) - "push %%"REG_a" \n\t" - "movl %2, %%"REG_a" \n\t" - "movq (%%"REG_a", %%"REG_a"), %%mm5 \n\t" // qmat[i] - "pop %%"REG_a" \n\t" + "addl %2, %%"REG_a" \n\t" + "movq (%%"REG_a"), %%mm5 \n\t" // qmat[i] + "subl %2, %%"REG_a" \n\t" #else - "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] + "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] #endif "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 "por %%mm0, %%mm4 \n\t"
--- ffmpeg-0.cvs20050626.orig/libavcodec/i386/mpegvideo_mmx_template.c +++ ffmpeg-0.cvs20050626/libavcodec/i386/mpegvideo_mmx_template.c @@ -95,7 +95,12 @@ SPREADW(%%mm3) "pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0 +#if defined(PIC) && !defined(ARCH_X86_64) + "movl %2, %%"REG_a" \n\t" + "movq (%%"REG_a"), %%mm5 \n\t" // qmat[0] +#else "movq (%2), %%mm5 \n\t" // qmat[0] +#endif "pxor %%mm6, %%mm6 \n\t" "psubw (%3), %%mm6 \n\t" // -bias[0] "mov $-128, %%"REG_a" \n\t" @@ -128,7 +133,11 @@ "movd %%mm3, %%"REG_a" \n\t" "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) +#if defined(PIC) && !defined(ARCH_X86_64) + : "r" (block+64), "m" (qmat), "r" (bias), +#else : "r" (block+64), "r" (qmat), "r" (bias), +#endif "r" (inv_zigzag_direct16+64), "r" (temp_block+64) ); // note the asm is split cuz gcc doesnt like that many operands ... @@ -157,7 +166,13 @@ "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) "movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0] "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] - "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] +#if defined(PIC) && !defined(ARCH_X86_64) + "addl %2, %%"REG_a" \n\t" + "movq (%%"REG_a"), %%mm5 \n\t" // qmat[i] + "subl %2, %%"REG_a" \n\t" +#else + "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] +#endif "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 "por %%mm0, %%mm4 \n\t" "pxor %%mm1, %%mm0 \n\t" @@ -179,7 +194,11 @@ "movd %%mm3, %%"REG_a" \n\t" "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) +#if defined(PIC) && !defined(ARCH_X86_64) + : "r" (block+64), "m" (qmat+64), "r" (bias+64), +#else : "r" (block+64), "r" (qmat+64), "r" (bias+64), +#endif "r" (inv_zigzag_direct16+64), "r" (temp_block+64) ); // note the asm is split cuz gcc doesnt like that many operands ...