Hi all, > I sat down for a bit ;-) and came up with an Altivec-optimised IDCT > implementation for vlc (well, I integrated Motorola's Altivec IDCT).
New release: this time it actually works ;-) > This is in fact the same code that already exists in vlc for Mac OS X, > but it uses the Motorola-published assembler code (you can find it on > their site). ... plus the final matrix transpose, which they forgot :-) I'll be on holiday for a week; I figured I'd send this off since it is in a working state ;-) The diff should apply to both 0.2.82 and 0.2.83. Have fun! Michel PS: this is not Paul's IDCT, which I plan to integrate later on and compare against Motorola's for performance and accuracy. ------------------------------------------------------------------------- Michel Lanners | " Read Philosophy. Study Art. 23, Rue Paul Henkes | Ask Questions. Make Mistakes. L-1710 Luxembourg | email [EMAIL PROTECTED] | http://www.cpu.lu/~mlan | Learn Always. "
diff -ur vlc-0.2.82/Makefile vlc-0.2.82-altivec/Makefile --- vlc-0.2.82/Makefile Tue Aug 7 12:55:49 2001 +++ vlc-0.2.82-altivec/Makefile Sun Aug 26 10:04:20 2001 @@ -18,7 +18,7 @@ # # All possible plugin objects # -PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin directx/directx dsp/dsp dummy/dummy dummy/null dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gtk/gnome gtk/gtk downmix/downmix downmix/downmixsse downmix/downmix3dn idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext imdct/imdct imdct/imdct3dn imdct/imdctsse kde/kde macosx/macosx mga/mga motion/motion motion/motionmmx motion/motionmmxext mpeg/es mpeg/ps mpeg/ts qt/qt sdl/sdl text/ncurses text/rc x11/x11 x11/xvideo yuv/yuv yuv/yuvmmx +PLUGINS_TARGETS := alsa/alsa beos/beos darwin/darwin directx/directx dsp/dsp dummy/dummy dummy/null dvd/dvd esd/esd fb/fb ggi/ggi glide/glide gtk/gnome gtk/gtk downmix/downmix downmix/downmixsse downmix/downmix3dn idct/idct idct/idctclassic idct/idctmmx idct/idctmmxext idct/idctaltivec imdct/imdct imdct/imdct3dn imdct/imdctsse kde/kde macosx/macosx mga/mga motion/motion motion/motionmmx motion/motionmmxext mpeg/es mpeg/ps mpeg/ts qt/qt sdl/sdl text/ncurses text/rc x11/x11 x11/xvideo yuv/yuv yuv/yuvmmx # # C Objects diff -ur vlc-0.2.82/Makefile.opts.in vlc-0.2.82-altivec/Makefile.opts.in --- vlc-0.2.82/Makefile.opts.in Tue Aug 7 12:55:49 2001 +++ vlc-0.2.82-altivec/Makefile.opts.in Sun Aug 26 15:12:53 2001 @@ -45,7 +45,7 @@ # Build environment # CC = @CC@ -CFLAGS = @CFLAGS@ +CFLAGS = -Wa,-m7400 @CFLAGS@ SHELL = @SHELL@ RANLIB = @RANLIB@ WINDRES = @WINDRES@ diff -ur vlc-0.2.82/configure vlc-0.2.82-altivec/configure --- vlc-0.2.82/configure Tue Aug 7 12:55:49 2001 +++ vlc-0.2.82-altivec/configure Sun Aug 26 10:06:10 2001 @@ -3675,7 +3675,8 @@ enableval="$enable_altivec" if test x$enableval = xyes; then ARCH="${ARCH} altivec" BUILTINS="${BUILTINS} idctaltivec" - LIB_IDCTALTIVEC="-framework vecLib" +# LIB_IDCTALTIVEC="-framework vecLib" + LIB_IDCTALTIVEC="" fi fi diff -ur 
vlc-0.2.82/include/vdec_ext-plugins.h vlc-0.2.82-altivec/include/vdec_ext-plugins.h --- vlc-0.2.82/include/vdec_ext-plugins.h Tue Aug 7 12:55:48 2001 +++ vlc-0.2.82-altivec/include/vdec_ext-plugins.h Tue Aug 28 21:05:03 2001 @@ -103,6 +103,7 @@ /* IDCT iformations */ void * p_idct_data; + void * p_idct_data_raw; /* Input properties */ struct vdec_pool_s * p_pool; diff -ur vlc-0.2.82/plugins/idct/idctaltivec.c vlc-0.2.82-altivec/plugins/idct/idctaltivec.c --- vlc-0.2.82/plugins/idct/idctaltivec.c Tue Aug 7 12:55:49 2001 +++ vlc-0.2.82-altivec/plugins/idct/idctaltivec.c Tue Aug 28 00:32:21 2001 @@ -23,6 +23,8 @@ #define MODULE_NAME idctaltivec +#undef DEBUG + /***************************************************************************** * Preamble *****************************************************************************/ @@ -47,10 +49,14 @@ #include "vdec_block.h" #include "vdec_idct.h" -#include "idctaltivec.h" +//#include "idctaltivec.h" +//extern void IDCT(short *input, short *output); +#include "idctaltivecasm.h" #include "modules_export.h" +#include "testdata.h" +//int dummy, dummy2; /***************************************************************************** * Local prototypes. 
*****************************************************************************/ @@ -115,7 +121,8 @@ } /* The Altivec iDCT is deactivated until it really works */ - return( 0 /* 200 */ ); + //return( 0 /* 200 */ ); + return( 200 ); } /***************************************************************************** @@ -130,6 +137,25 @@ *****************************************************************************/ void _M( vdec_IDCT )( void * p_idct_data, dctelem_t * p_block, int i_idontcare ) { +#ifdef DEBUG + int i; + + for(i=0; i<64;i++) + *(p_block+i)=testdata[i]; + + fprintf(stderr, "p_block alignment: 0x%p\n", p_block); + fprintf(stderr, "p_block before IDCT: 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx 0x%04hx\n", *p_block, *(p_block+1), *(p_block+2), *(p_block+3), *(p_block+4), *(p_block+5), *(p_block+6), *(p_block+7)); +#endif + IDCT( p_block, p_block ); + +#ifdef DEBUG + fprintf(stderr, "p_block after IDCT:\n"); + for (i=0;i<8;i++) { + dctelem_t *p=p_block+(i*8); + fprintf(stderr, " % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi % 5hi\n", *p, *(p+1), *(p+2), *(p+3), *(p+4), *(p+5), *(p+6), *(p+7)); + } + exit(0); +#endif } diff -ur vlc-0.2.82/plugins/idct/vdec_idct.c vlc-0.2.82-altivec/plugins/idct/vdec_idct.c --- vlc-0.2.82/plugins/idct/vdec_idct.c Tue Aug 7 12:55:49 2001 +++ vlc-0.2.82-altivec/plugins/idct/vdec_idct.c Tue Aug 28 21:27:23 2001 @@ -57,7 +57,14 @@ int i; dctelem_t * p_pre; - p_vdec->p_idct_data = malloc( sizeof(dctelem_t) * 64 * 64 ); + /* the IDCT data buffer needs to meet certain alignment constraints + * (currently 16 bytes for Altivec vector ops) + */ +#define align 16 + + p_vdec->p_idct_data_raw = malloc( sizeof(dctelem_t) * 64 * 64 + align ); + p_vdec->p_idct_data = + (void *)(((unsigned long)p_vdec->p_idct_data_raw + align - 1) & -align); p_pre = (dctelem_t *) p_vdec->p_idct_data; memset( p_pre, 0, 64 * 64 * sizeof(dctelem_t) ); diff -ur vlc-0.2.82/src/interface/main.c vlc-0.2.82-altivec/src/interface/main.c --- 
vlc-0.2.82/src/interface/main.c Tue Aug 7 12:55:48 2001 +++ vlc-0.2.82-altivec/src/interface/main.c Sun Aug 26 13:26:16 2001 @@ -1031,6 +1031,7 @@ { volatile int i_capabilities = CPU_CAPABILITY_NONE; + i_capabilities |= CPU_CAPABILITY_ALTIVEC; #if defined( SYS_BEOS ) i_capabilities |= CPU_CAPABILITY_486 | CPU_CAPABILITY_586 diff -ur vlc-0.2.82/src/video_decoder/video_decoder.c vlc-0.2.82-altivec/src/video_decoder/video_decoder.c --- vlc-0.2.82/src/video_decoder/video_decoder.c Tue Aug 7 12:55:48 2001 +++ vlc-0.2.82-altivec/src/video_decoder/video_decoder.c Tue Aug 28 21:08:38 2001 @@ -166,9 +166,9 @@ { intf_DbgMsg("vdec debug: EndThread(%p)", p_vdec); - if( p_vdec->p_idct_data != NULL ) + if( p_vdec->p_idct_data_raw != NULL ) { - free( p_vdec->p_idct_data ); + free( p_vdec->p_idct_data_raw ); } free( p_vdec ); diff -uNr vlc-0.2.82/plugins/idct/idctaltivecasm.h vlc-0.2.82-altivec/plugins/idct/idctaltivecasm.h --- vlc-0.2.82/plugins/idct/idctaltivecasm.h Thu Jan 1 01:00:00 1970 +++ vlc-0.2.82-altivec/plugins/idct/idctaltivecasm.h Tue Aug 28 00:00:07 2001 @@ -0,0 +1,211 @@ +/* IDCT ASM function from Motorola + * + * The original Motorola implementation lacks a matrix transpose + * operation on the result. Duh... + */ + +/*************************************************************** + * + * Copyright: (c) Copyright Motorola Inc. 1998 + * + * Date: April 17, 1998 + * + * Function: IDCT + * + * Description: Scaled Chen (III) algorithm for IDCT + * Arithmetic is 16-bit fixed point. + * + * Inputs: input - Pointer to input data (short), which + * must be between -2048 to +2047. + * It is assumed that the allocated array + * has been 128-bit aligned and contains + * 8x8 short elements. + * + * Outputs: output - Pointer to output area for the transfored + * data. The output values are between -255 + * and 255 . It is assumed that a 128-bit + * aligned 8x8 array of short has been + * pre-allocated. 
+ * + * Return: None + * + ***************************************************************/ + +signed short SpecialConstants[8] __attribute__ ((aligned (16))) = { + 23170, 13573, 6518, 21895, -23170, -21895, 0, 0 }; + +signed short PreScale[64] __attribute__ ((aligned (16))) = { + 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681, + 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880, + 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422, + 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680, + 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681, + 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680, + 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422, + 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 }; + +static __inline__ void IDCT(short *input, short *output) { + + +// fprintf(stderr, "Input: %p, Addresses: %p, %p\n", input, +// &SpecialConstants[0], &PreScale[0]); + + asm volatile ( " + addi 11,0,16 + addi 9,0,32 + lvx 0,%1,11 + addi 7,0,48 + lvx 1,%3,11 + vspltisw 2,0 + lvx 3,0,%2 + addi 10,0,80 + lvx 4,%1,9 + vmhraddshs 5,0,1,2 + lvx 6,%3,9 + addi 6,0,112 + lvx 7,%1,7 + vsplth 8,3,0x2 + lvx 9,%3,7 + vmhraddshs 10,4,6,2 + lvx 11,%1,10 + vsplth 12,3,0x1 + lvx 13,%3,10 + vsplth 14,3,0x3 + lvx 15,%1,6 + vmhraddshs 16,7,9,2 + lvx 17,0,%1 + vsplth 18,3,0x5 + lvx 19,%3,6 + addi 5,0,64 + lvx 1,0,%3 + vmhraddshs 0,11,13,2 + addi 8,0,96 + lvx 6,%1,5 + lvx 4,%3,5 + vmhraddshs 9,15,19,2 + vmhraddshs 7,17,1,2 + lvx 13,%1,8 + lvx 11,%3,8 + vmhraddshs 19,6,4,2 + vmhraddshs 15,8,5,2 + vsplth 1,3,0x0 + vmhraddshs 17,13,11,2 + vsplth 4,3,0x4 + vmhraddshs 6,8,9,5 + vmhraddshs 11,14,0,16 + vmhraddshs 13,18,16,0 + vmhraddshs 3,12,10,2 + vsubshs 5,15,9 + vsubshs 0,7,19 + vsubshs 16,3,17 + vmhraddshs 15,12,17,10 + vsubshs 9,5,13 + vsubshs 3,6,11 + vaddshs 17,7,19 + vaddshs 10,0,16 + vsubshs 19,0,16 + vsubshs 7,3,9 + vaddshs 16,3,9 + vaddshs 0,5,13 + vmhraddshs 3,1,16,10 + vaddshs 9,6,11 + vmhraddshs 5,4,16,10 + vaddshs 13,17,15 + vmhraddshs 11,1,7,19 + vsubshs 6,17,15 + vmhraddshs 16,4,7,19 + vaddshs 10,13,9 + 
vmrghh 17,11,5 + vsubshs 15,13,9 + vmrglh 7,11,5 + vaddshs 19,6,0 + vmrghh 13,3,16 + vsubshs 9,6,0 + vmrghh 11,19,15 + vmrghh 5,10,9 + vmrglh 6,10,9 + vmrglh 0,3,16 + vmrglh 9,19,15 + vmrghh 10,5,17 + vmrghh 16,13,11 + vmrglh 3,5,17 + vmrghh 19,6,7 + vmrglh 15,6,7 + vmrglh 5,13,11 + vmrghh 17,0,9 + vmrglh 6,0,9 + vmrglh 7,10,16 + vmrghh 11,3,5 + vmhraddshs 13,8,7,2 + vmrglh 9,3,5 + vmhraddshs 0,12,11,2 + vmrglh 5,19,17 + vmrglh 3,15,6 + vmhraddshs 2,14,5,9 + vmhraddshs 14,8,3,7 + vmrghh 7,10,16 + vmhraddshs 8,18,9,5 + vmrghh 16,19,17 + vmrghh 10,15,6 + vsubshs 5,13,3 + vsubshs 9,7,16 + vsubshs 18,0,10 + vmhraddshs 17,12,10,11 + vsubshs 19,5,8 + vsubshs 6,14,2 + vaddshs 15,7,16 + vaddshs 3,9,18 + vsubshs 13,9,18 + vsubshs 0,6,19 + vaddshs 10,6,19 + vaddshs 11,5,8 + vmhraddshs 12,1,10,3 + vaddshs 16,14,2 + vmhraddshs 7,4,10,3 + vaddshs 18,15,17 + vmhraddshs 9,1,0,13 + vsubshs 6,15,17 + vmhraddshs 19,4,0,13 + vaddshs 5,18,16 + vsubshs 8,18,16 + vaddshs 14,6,11 + vsubshs 2,6,11 + vmrghh 0,5,2 + vmrglh 1,5,2 + vmrghh 3,12,19 + vmrglh 4,12,19 + vmrghh 10,9,7 + vmrglh 13,9,7 + vmrghh 15,14,8 + vmrglh 17,14,8 + vmrghh 5,0,10 + vmrglh 12,0,10 + vmrghh 9,1,13 + vmrglh 14,1,13 + vmrghh 2,3,15 + vmrglh 19,3,15 + vmrghh 7,4,17 + vmrglh 8,4,17 + vmrghh 0,5,2 + vmrglh 1,5,2 + stvx 0,0,%0 + vmrghh 3,12,19 + stvx 1,%0,11 + vmrglh 4,12,19 + stvx 3,%0,9 + vmrghh 10,9,7 + stvx 4,%0,7 + vmrglh 13,9,7 + stvx 10,%0,5 + vmrghh 15,14,8 + stvx 13,%0,10 + vmrglh 17,14,8 + stvx 15,%0,8 + stvx 17,%0,6 + " + : + : "r" (output), "r" (input), "r" (SpecialConstants), "r" (PreScale) + : "cc", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "memory" ); + /* End asm */ + +} diff -uNr vlc-0.2.82/plugins/idct/testdata.h vlc-0.2.82-altivec/plugins/idct/testdata.h --- vlc-0.2.82/plugins/idct/testdata.h Thu Jan 1 01:00:00 1970 +++ vlc-0.2.82-altivec/plugins/idct/testdata.h Mon Aug 27 22:01:57 2001 @@ -0,0 +1,12 @@ +/* Testdata for IDCT */ + +dctelem_t testdata[] = { + 131, 0, 0, 131, -51, 0, 0, 0, + 0, -51, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, -51, 0, 0, 0, 0, + 0, 0, 101, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, -51, 0, 0, + 0, 0, 0, 0, 101, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 101, 0, 0, 0, 101 }; +