On Mon, Jun 19, 2006 at 06:28:40PM +0200, Loïc Minier wrote:
>  From the gcc man page, -mpreferred-stack-boundary flag:
>    To ensure proper alignment of this values on the stack, the stack
>    boundary must be as aligned as that required by any value stored on
>    the stack.  
[...]
>  None of liboil, banshee, or even mono seem to be built with
>  -mpreferred-stack-boundary, yet I can imagine some of this software has
>  misaligned the stack.  Is there a way to find out which and add stack
>  alignment code before external function calls?

After some discussion with Loïc on IRC, I've implemented something
like this. It works for both of us, so I'm pretty sure it works fine
on i386.

Where I'm not sure it works is amd64, since I don't have access
to an amd64 machine. Michael Ablassmeier reported that it works there,
though.

The idea is to add a little wrapper function around each of the SSE
functions to make sure the stack is aligned before they are called.
It would be nice if gcc could do this itself; without its support,
though, some asm magic does the job.


I've attached the patch. It would be nice if someone who knows amd64
assembly could review this, and add proper amd64 support via ifdefs
if needed.

Cheers,
Christian Aichinger
--- liboil-0.3.9.orig/liboil/sse/composite_sse_2pix.c   2005-12-21 02:27:54.000000000 +0100
+++ liboil-0.3.9/liboil/sse/composite_sse_2pix.c        2006-06-20 16:12:12.000000000 +0200
@@ -32,6 +32,29 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name(__VA_ARGS__) __attribute__((used));                        \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        OIL_SSE_WRAPPER_CALL(name);                                     \
+    }
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x10,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
 /* non-SSE2 compositing support */
 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
@@ -41,20 +64,12 @@
  * the channel value in the low byte.  This means 2 pixels per pass.
  */
 
-union m128_int {
-  __m128i m128;
-  uint64_t ull[2];
-};
-
-static const struct _SSEData {
-  union m128_int sse_8x00ff;
-  union m128_int sse_8x0080;
-} c = {
-    .sse_8x00ff.ull =  {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
-    .sse_8x0080.ull =  {0x0080008000800080ULL, 0x0080008000800080ULL},
-};
+static const __m128i c_sse_8x00ff = 
+        {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL};
+static const __m128i c_sse_8x0080 = 
+        {0x0080008000800080ULL, 0x0080008000800080ULL};
 
-#define MC(x) (c.sse_##x.m128)
+#define MC(x) (c_sse_##x)
 
 /* Shuffles the given value such that the alpha for each pixel appears in each
  * channel of the pixel.
@@ -188,7 +203,11 @@
        COMPOSITE_IN(oil_argb_B(*src), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_in_argb_const_src_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_2pix_wrap,
     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -216,7 +235,10 @@
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix,
+OIL_SSE_WRAPPER(composite_in_argb_const_mask_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_2pix_wrap,
     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -272,7 +294,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_over_argb_const_src_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_2pix_wrap,
     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -309,8 +335,12 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix, composite_in_over_argb,
-    OIL_IMPL_FLAG_SSE2);
+
+OIL_SSE_WRAPPER(composite_in_over_argb_sse_2pix , static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_sse_2pix_wrap, 
+        composite_in_over_argb, OIL_IMPL_FLAG_SSE2);
 
 static void
 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
@@ -348,7 +378,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_in_over_argb_const_src_sse_2pix , static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_2pix_wrap,
     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -387,7 +421,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix,
+
+OIL_SSE_WRAPPER(composite_in_over_argb_const_mask_sse_2pix, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_2pix_wrap,
     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
--- liboil-0.3.9.orig/liboil/sse/composite_sse_4pix.c   2005-12-21 02:27:54.000000000 +0100
+++ liboil-0.3.9/liboil/sse/composite_sse_4pix.c        2006-06-20 16:12:36.000000000 +0200
@@ -32,20 +32,36 @@
 #include <emmintrin.h>
 #include <liboil/liboilcolorspace.h>
 
-union m128_int {
-  __m128i m128;
-  uint64_t ull[2];
-};
-
-static const struct _SSEData {
-  union m128_int sse_16xff;
-  union m128_int sse_8x0080;
-} c = {
-    .sse_16xff.ull =   {0xffffffffffffffffULL, 0xffffffffffffffffULL},
-    .sse_8x0080.ull =  {0x0080008000800080ULL, 0x0080008000800080ULL},
-};
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name(__VA_ARGS__) __attribute__((used));                        \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        OIL_SSE_WRAPPER_CALL(name);                                     \
+    }
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x10,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
+
+static const __m128i c_sse_16xff =
+        {0xffffffffffffffffULL, 0xffffffffffffffffULL};
+static const __m128i c_sse_8x0080 =
+        {0x0080008000800080ULL, 0x0080008000800080ULL};
 
-#define MC(x) (c.sse_##x.m128)
+#define MC(x) (c_sse_##x)
 
 /* non-SSE2 compositing support */
 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
@@ -193,7 +209,11 @@
        COMPOSITE_IN(oil_argb_B(s), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_sse, composite_in_argb,
+
+OIL_SSE_WRAPPER(composite_in_argb_sse, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_sse_wrap, composite_in_argb,
     OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -230,7 +250,11 @@
        COMPOSITE_IN(oil_argb_B(*src), m));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse,
+
+OIL_SSE_WRAPPER(composite_in_argb_const_src_sse , static void, 
+    uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_src_sse_wrap,
     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -267,7 +291,10 @@
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse,
+OIL_SSE_WRAPPER(composite_in_argb_const_mask_sse, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_argb_const_mask_sse_wrap,
     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -339,7 +366,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse,
+
+OIL_SSE_WRAPPER(composite_over_argb_const_src_sse, static void, 
+    uint32_t *dest, const uint32_t *src, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_over_argb_const_src_sse_wrap,
     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 
 static void
@@ -447,9 +478,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse,
-    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
+OIL_SSE_WRAPPER(composite_in_over_argb_const_src_sse , static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
 
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_src_sse_wrap,
+    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
 static void
 composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
     const uint8_t *mask, int n)
@@ -502,7 +535,11 @@
     *dest++ = d;
   }
 }
-OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse,
+
+OIL_SSE_WRAPPER(composite_in_over_argb_const_mask_sse, static void, 
+        uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n)
+
+OIL_DEFINE_IMPL_FULL (composite_in_over_argb_const_mask_sse_wrap,
     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
 
 static void
--- liboil-0.3.9.orig/liboil/sse/sad8x8_sse.c   2005-12-23 22:46:25.000000000 +0100
+++ liboil-0.3.9/liboil/sse/sad8x8_sse.c        2006-06-20 16:47:42.000000000 +0200
@@ -31,6 +31,31 @@
 #include <liboil/liboilfunction.h>
 #include <emmintrin.h>
 
+/* Work around non-aligned stack frames (which cause the intrinsics to crash)
+ * by making sure the stack frame is always aligned.
+ */
+#define OIL_SSE_WRAPPER(name,ret, ...)                                  \
+    ret name(__VA_ARGS__) __attribute__((used));                        \
+    ret name ## _wrap (__VA_ARGS__) {                                   \
+        OIL_SSE_WRAPPER_CALL(name);                                     \
+    }
+
+#define OIL_SSE_WRAPPER_CALL(name)                                      \
+    asm volatile(                                                       \
+            "\n\t"                                                      \
+            "subl $0x18,%%esp\n\t"                                      \
+            "andl $0xfffffff0,%%esp\n\t"                                \
+                                                                        \
+            "movdqu 8(%%ebp),%%xmm0\n\t"                                \
+            "movdqa %%xmm0,(%%esp)\n\t"                                 \
+            "movl 0x18(%%ebp), %%ecx\n\t"                               \
+            "movl %%ecx, 0x10(%%esp)\n\t"                               \
+                                                                        \
+            "call " #name "\n\t"                                        \
+            "movl %%ebp,%%esp\n\t"                                      \
+            : :                                                         \
+            : "eax","ecx","edx","xmm0")
+
 union m128_int {
   __m128i m128;
   uint32_t i[4];
@@ -42,7 +67,7 @@
     int sstr2)
 {
   int i;
-  __m128i sum = _mm_setzero_si128();
+  __m128i sum __attribute__ ((aligned (16))) = _mm_setzero_si128();
   union m128_int sumi;
 
   for (i = 0; i < 4; i++) {
@@ -60,4 +85,7 @@
   sumi.m128 = sum;
   *dest = sumi.i[0] + sumi.i[2];
 }
-OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse, sad8x8_u8, OIL_IMPL_FLAG_SSE2);
+
+OIL_SSE_WRAPPER(sad8x8_u8_sse, static void, 
+        uint32_t *dest, uint8_t *src1, int sstr1, uint8_t *src2, int sstr2)
+OIL_DEFINE_IMPL_FULL (sad8x8_u8_sse_wrap, sad8x8_u8, OIL_IMPL_FLAG_SSE2);

Attachment: signature.asc
Description: Digital signature

Reply via email to