Hi all,

> I've sat down a bit ;-) and came up with an Altivec-optimised IDCT
> implementation in vlc (well, I integrated Motorola's Altivec IDCT).

New release: this time it actually works ;-)

> This is in fact the same code that already exists in vlc for MacOS X,
> but it uses the Motorola-published assembler code (you can find it on
> their site).

... plus the final matrix transpose which they forgot :-)

I'll be on holiday for a week; I figured I'd send this off since it is
in a working state ;-) Diff should apply to both 0.2.82 and 0.2.83.

Have fun!

Michel

PS: this is not Paul's IDCT, which I plan on integrating later on and
comparing against Motorola's for performance and accuracy.

-------------------------------------------------------------------------
Michel Lanners                 |  " Read Philosophy.  Study Art.
23, Rue Paul Henkes            |    Ask Questions.  Make Mistakes.
L-1710 Luxembourg              |
email   [EMAIL PROTECTED]            |
http://www.cpu.lu/~mlan        |                     Learn Always. "
diff -ur vlc-0.2.82/Makefile vlc-0.2.82-altivec/Makefile
--- vlc-0.2.82/Makefile Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/Makefile Sun Aug 26 10:04:20 2001
@@ -18,7 +18,7 @@
 #
 # All possible plugin objects
 #
-PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin directx/directx dsp/dsp dummy/dummy dummy/null dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gtk/gnome gtk/gtk downmix/downmix downmix/downmixsse downmix/downmix3dn idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext imdct/imdct imdct/imdct3dn imdct/imdctsse kde/kde macosx/macosx mga/mga motion/motion motion/motionmmx motion/motionmmxext mpeg/es mpeg/ps mpeg/ts qt/qt sdl/sdl text/ncurses text/rc x11/x11 x11/xvideo yuv/yuv yuv/yuvmmx
+PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin directx/directx dsp/dsp dummy/dummy dummy/null dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gtk/gnome gtk/gtk downmix/downmix downmix/downmixsse downmix/downmix3dn idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext idct/idctaltivec imdct/imdct imdct/imdct3dn imdct/imdctsse kde/kde macosx/macosx mga/mga motion/motion motion/motionmmx motion/motionmmxext mpeg/es mpeg/ps mpeg/ts qt/qt sdl/sdl text/ncurses text/rc x11/x11 x11/xvideo yuv/yuv yuv/yuvmmx
 
 #
 # C Objects
diff -ur vlc-0.2.82/Makefile.opts.in vlc-0.2.82-altivec/Makefile.opts.in
--- vlc-0.2.82/Makefile.opts.in Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/Makefile.opts.in Sun Aug 26 15:12:53 2001
@@ -45,7 +45,7 @@
 # Build environment
 # 
 CC = @CC@
-CFLAGS = @CFLAGS@
+CFLAGS = -Wa,-m7400 @CFLAGS@
 SHELL = @SHELL@
 RANLIB = @RANLIB@
 WINDRES = @WINDRES@
diff -ur vlc-0.2.82/configure vlc-0.2.82-altivec/configure
--- vlc-0.2.82/configure        Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/configure        Sun Aug 26 10:06:10 2001
@@ -3675,7 +3675,8 @@
   enableval="$enable_altivec"
    if test x$enableval = xyes; then ARCH="${ARCH} altivec"
     BUILTINS="${BUILTINS} idctaltivec"
-    LIB_IDCTALTIVEC="-framework vecLib"
+#    LIB_IDCTALTIVEC="-framework vecLib"
+    LIB_IDCTALTIVEC=""
   fi 
 fi
 
diff -ur vlc-0.2.82/include/vdec_ext-plugins.h vlc-0.2.82-altivec/include/vdec_ext-plugins.h
--- vlc-0.2.82/include/vdec_ext-plugins.h       Tue Aug  7 12:55:48 2001
+++ vlc-0.2.82-altivec/include/vdec_ext-plugins.h       Tue Aug 28 21:05:03 2001
@@ -103,6 +103,7 @@
 
     /* IDCT iformations */
     void *              p_idct_data;
+    void *              p_idct_data_raw;
 
     /* Input properties */
     struct vdec_pool_s * p_pool;
diff -ur vlc-0.2.82/plugins/idct/idctaltivec.c vlc-0.2.82-altivec/plugins/idct/idctaltivec.c
--- vlc-0.2.82/plugins/idct/idctaltivec.c       Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/plugins/idct/idctaltivec.c       Tue Aug 28 00:32:21 2001
@@ -23,6 +23,8 @@
 
 #define MODULE_NAME idctaltivec
 
+#undef DEBUG
+
 /*****************************************************************************
  * Preamble
  *****************************************************************************/
@@ -47,10 +49,14 @@
 #include "vdec_block.h"
 #include "vdec_idct.h"
 
-#include "idctaltivec.h"
+//#include "idctaltivec.h"
+//extern void IDCT(short *input, short *output);
+#include "idctaltivecasm.h"
 
 #include "modules_export.h"
 
+#include "testdata.h"
+//int dummy, dummy2;
 /*****************************************************************************
  * Local prototypes.
  *****************************************************************************/
@@ -115,7 +121,8 @@
     }
 
     /* The Altivec iDCT is deactivated until it really works */
-    return( 0 /* 200 */ );
+    //return( 0 /* 200 */ );
+    return( 200 );
 }
 
 /*****************************************************************************
@@ -130,6 +137,25 @@
  *****************************************************************************/
 void _M( vdec_IDCT )( void * p_idct_data, dctelem_t * p_block, int i_idontcare )
 {
+#ifdef DEBUG
+       int i;
+       
+       for(i=0; i<64;i++)
+           *(p_block+i)=testdata[i];
+
+       fprintf(stderr, "p_block alignment: 0x%p\n", p_block);
+       fprintf(stderr, "p_block before IDCT: 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx\n", *p_block, *(p_block+1), *(p_block+2), *(p_block+3), *(p_block+4), *(p_block+5), *(p_block+6), *(p_block+7));
+#endif
+
     IDCT( p_block, p_block );
+
+#ifdef DEBUG
+   fprintf(stderr, "p_block after IDCT:\n");
+    for (i=0;i<8;i++) {
+        dctelem_t *p=p_block+(i*8);
+        fprintf(stderr, " % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi\n", *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5), *(p+6), *(p+7));
+    }
+   exit(0);
+#endif
 }
 
diff -ur vlc-0.2.82/plugins/idct/vdec_idct.c vlc-0.2.82-altivec/plugins/idct/vdec_idct.c
--- vlc-0.2.82/plugins/idct/vdec_idct.c Tue Aug  7 12:55:49 2001
+++ vlc-0.2.82-altivec/plugins/idct/vdec_idct.c Tue Aug 28 21:27:23 2001
@@ -57,7 +57,14 @@
     int i;
     dctelem_t * p_pre;
 
-    p_vdec->p_idct_data = malloc( sizeof(dctelem_t) * 64 * 64 );
+    /* the IDCT data buffer needs to meet certain alignment constraints
+     * (currently 16 bytes for Altivec vector ops)
+     */
+#define align 16
+
+    p_vdec->p_idct_data_raw = malloc( sizeof(dctelem_t) * 64 * 64 + align );
+    p_vdec->p_idct_data =
+           (void *)(((unsigned long)p_vdec->p_idct_data_raw + align - 1) & -align);
     p_pre = (dctelem_t *) p_vdec->p_idct_data;
     memset( p_pre, 0, 64 * 64 * sizeof(dctelem_t) );
 
diff -ur vlc-0.2.82/src/interface/main.c vlc-0.2.82-altivec/src/interface/main.c
--- vlc-0.2.82/src/interface/main.c     Tue Aug  7 12:55:48 2001
+++ vlc-0.2.82-altivec/src/interface/main.c     Sun Aug 26 13:26:16 2001
@@ -1031,6 +1031,7 @@
 {
     volatile int i_capabilities = CPU_CAPABILITY_NONE;
 
+        i_capabilities |= CPU_CAPABILITY_ALTIVEC;
 #if defined( SYS_BEOS )
     i_capabilities |= CPU_CAPABILITY_486
                       | CPU_CAPABILITY_586
diff -ur vlc-0.2.82/src/video_decoder/video_decoder.c vlc-0.2.82-altivec/src/video_decoder/video_decoder.c
--- vlc-0.2.82/src/video_decoder/video_decoder.c        Tue Aug  7 12:55:48 2001
+++ vlc-0.2.82-altivec/src/video_decoder/video_decoder.c        Tue Aug 28 21:08:38 2001
@@ -166,9 +166,9 @@
 {
     intf_DbgMsg("vdec debug: EndThread(%p)", p_vdec);
 
-    if( p_vdec->p_idct_data != NULL )
+    if( p_vdec->p_idct_data_raw != NULL )
     {
-        free( p_vdec->p_idct_data );
+        free( p_vdec->p_idct_data_raw );
     }
 
     free( p_vdec );
diff -uNr vlc-0.2.82/plugins/idct/idctaltivecasm.h vlc-0.2.82-altivec/plugins/idct/idctaltivecasm.h
--- vlc-0.2.82/plugins/idct/idctaltivecasm.h    Thu Jan  1 01:00:00 1970
+++ vlc-0.2.82-altivec/plugins/idct/idctaltivecasm.h    Tue Aug 28 00:00:07 2001
@@ -0,0 +1,211 @@
+/* IDCT ASM function from Motorola
+ *
+ * The original Motorola implementation lacks a matrix transpose
+ * operation on the result. Duh...
+ */
+
+/***************************************************************
+ *
+ * Copyright:   (c) Copyright Motorola Inc. 1998
+ *
+ * Date:        April 17, 1998
+ *
+ * Function:    IDCT
+ *
+ * Description: Scaled Chen (III) algorithm for IDCT
+ *              Arithmetic is 16-bit fixed point.
+ *
+ * Inputs:      input - Pointer to input data (short), which
+ *                      must be between -2048 to +2047.
+ *                      It is assumed that the allocated array
+ *                      has been 128-bit aligned and contains
+ *                      8x8 short elements.
+ *
+ * Outputs:     output - Pointer to output area for the transformed
+ *                       data. The output values are between -255
+ *                       and 255 . It is assumed that a 128-bit
+ *                       aligned 8x8 array of short has been
+ *                       pre-allocated.
+ *
+ * Return:      None
+ *
+ ***************************************************************/
+
+signed short SpecialConstants[8] __attribute__ ((aligned (16))) = {
+                       23170, 13573, 6518, 21895, -23170, -21895, 0, 0 };
+
+signed short PreScale[64] __attribute__ ((aligned (16))) = {
+                       4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+                       5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880,
+                       5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+                       4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+                       4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+                       4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+                       5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+                       5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 };
+
+static __inline__ void IDCT(short *input, short *output) {
+
+
+//     fprintf(stderr, "Input: %p, Addresses: %p, %p\n", input,
+//                     &SpecialConstants[0], &PreScale[0]);
+
+       asm volatile ( "
+               addi    11,0,16
+               addi    9,0,32
+               lvx     0,%1,11
+               addi    7,0,48
+               lvx     1,%3,11
+               vspltisw        2,0
+               lvx     3,0,%2
+               addi    10,0,80
+               lvx     4,%1,9
+               vmhraddshs      5,0,1,2
+               lvx     6,%3,9
+               addi    6,0,112
+               lvx     7,%1,7
+               vsplth  8,3,0x2
+               lvx     9,%3,7
+               vmhraddshs      10,4,6,2
+               lvx     11,%1,10
+               vsplth  12,3,0x1
+               lvx     13,%3,10
+               vsplth  14,3,0x3
+               lvx     15,%1,6
+               vmhraddshs      16,7,9,2
+               lvx     17,0,%1
+               vsplth  18,3,0x5
+               lvx     19,%3,6
+               addi    5,0,64
+               lvx     1,0,%3
+               vmhraddshs      0,11,13,2
+               addi    8,0,96
+               lvx     6,%1,5
+               lvx     4,%3,5
+               vmhraddshs      9,15,19,2
+               vmhraddshs      7,17,1,2
+               lvx     13,%1,8
+               lvx     11,%3,8
+               vmhraddshs      19,6,4,2
+               vmhraddshs      15,8,5,2
+               vsplth  1,3,0x0
+               vmhraddshs      17,13,11,2
+               vsplth  4,3,0x4
+               vmhraddshs      6,8,9,5
+               vmhraddshs      11,14,0,16
+               vmhraddshs      13,18,16,0
+               vmhraddshs      3,12,10,2
+               vsubshs 5,15,9
+               vsubshs 0,7,19
+               vsubshs 16,3,17
+               vmhraddshs      15,12,17,10
+               vsubshs 9,5,13
+               vsubshs 3,6,11
+               vaddshs 17,7,19
+               vaddshs 10,0,16
+               vsubshs 19,0,16
+               vsubshs 7,3,9
+               vaddshs 16,3,9
+               vaddshs 0,5,13
+               vmhraddshs      3,1,16,10
+               vaddshs 9,6,11
+               vmhraddshs      5,4,16,10
+               vaddshs 13,17,15
+               vmhraddshs      11,1,7,19
+               vsubshs 6,17,15
+               vmhraddshs      16,4,7,19
+               vaddshs 10,13,9
+               vmrghh  17,11,5
+               vsubshs 15,13,9
+               vmrglh  7,11,5
+               vaddshs 19,6,0
+               vmrghh  13,3,16
+               vsubshs 9,6,0
+               vmrghh  11,19,15
+               vmrghh  5,10,9
+               vmrglh  6,10,9
+               vmrglh  0,3,16
+               vmrglh  9,19,15
+               vmrghh  10,5,17
+               vmrghh  16,13,11
+               vmrglh  3,5,17
+               vmrghh  19,6,7
+               vmrglh  15,6,7
+               vmrglh  5,13,11
+               vmrghh  17,0,9
+               vmrglh  6,0,9
+               vmrglh  7,10,16
+               vmrghh  11,3,5
+               vmhraddshs      13,8,7,2
+               vmrglh  9,3,5
+               vmhraddshs      0,12,11,2
+               vmrglh  5,19,17
+               vmrglh  3,15,6
+               vmhraddshs      2,14,5,9
+               vmhraddshs      14,8,3,7
+               vmrghh  7,10,16
+               vmhraddshs      8,18,9,5
+               vmrghh  16,19,17
+               vmrghh  10,15,6
+               vsubshs 5,13,3
+               vsubshs 9,7,16
+               vsubshs 18,0,10
+               vmhraddshs      17,12,10,11
+               vsubshs 19,5,8
+               vsubshs 6,14,2
+               vaddshs 15,7,16
+               vaddshs 3,9,18
+               vsubshs 13,9,18
+               vsubshs 0,6,19
+               vaddshs 10,6,19
+               vaddshs 11,5,8
+               vmhraddshs      12,1,10,3
+               vaddshs         16,14,2
+               vmhraddshs      7,4,10,3
+               vaddshs         18,15,17
+               vmhraddshs      9,1,0,13
+               vsubshs         6,15,17
+               vmhraddshs      19,4,0,13
+               vaddshs         5,18,16
+               vsubshs         8,18,16
+               vaddshs         14,6,11
+               vsubshs         2,6,11
+               vmrghh          0,5,2
+               vmrglh          1,5,2
+               vmrghh          3,12,19
+               vmrglh          4,12,19
+               vmrghh          10,9,7
+               vmrglh          13,9,7
+               vmrghh          15,14,8
+               vmrglh          17,14,8
+               vmrghh          5,0,10
+               vmrglh          12,0,10
+               vmrghh          9,1,13
+               vmrglh          14,1,13
+               vmrghh          2,3,15
+               vmrglh          19,3,15
+               vmrghh          7,4,17
+               vmrglh          8,4,17
+               vmrghh          0,5,2
+               vmrglh          1,5,2
+               stvx            0,0,%0
+               vmrghh          3,12,19
+               stvx            1,%0,11
+               vmrglh          4,12,19
+               stvx            3,%0,9
+               vmrghh          10,9,7
+               stvx            4,%0,7
+               vmrglh          13,9,7
+               stvx            10,%0,5
+               vmrghh          15,14,8
+               stvx            13,%0,10
+               vmrglh          17,14,8
+               stvx            15,%0,8
+               stvx            17,%0,6
+               "
+               :
+               : "r" (output), "r" (input), "r" (SpecialConstants), "r" (PreScale)
+               : "cc", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "memory" );
+       /* End asm */
+
+}
diff -uNr vlc-0.2.82/plugins/idct/testdata.h vlc-0.2.82-altivec/plugins/idct/testdata.h
--- vlc-0.2.82/plugins/idct/testdata.h  Thu Jan  1 01:00:00 1970
+++ vlc-0.2.82-altivec/plugins/idct/testdata.h  Mon Aug 27 22:01:57 2001
@@ -0,0 +1,12 @@
+/* Testdata for IDCT */
+
+dctelem_t testdata[] = {
+   131,     0,     0,   131,   -51,     0,     0,     0,
+     0,   -51,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,   -51,     0,     0,     0,     0,
+     0,     0,   101,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,   -51,     0,     0,
+     0,     0,     0,     0,   101,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,   101,     0,     0,     0,   101 };
+

Reply via email to