Changes in directory llvm/examples/SIMD/DCT:
Makefile added (r1.1.2.1)
dct.altivec.handwritten.c added (r1.1.2.1)
dct.sse.handwritten.c added (r1.1.2.1)
dct.vectorc.c added (r1.1.2.1)
main.c added (r1.1.2.1)
---
Log message:

Examples to illustrate Vector LLVM's SIMD support.


---
Diffs of the changes:  (+646 -0)

 Makefile                  |    4
 dct.altivec.handwritten.c |  198 ++++++++++++++++++++++++++++++++++++++++++++++
 dct.sse.handwritten.c     |  129 +++++++++++++++++++++++++++++
 dct.vectorc.c             |  140 ++++++++++++++++++++++++++++++++
 main.c                    |  175 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 646 insertions


Index: llvm/examples/SIMD/DCT/Makefile
diff -c /dev/null llvm/examples/SIMD/DCT/Makefile:1.1.2.1
*** /dev/null	Sun Oct 23 17:49:50 2005
--- llvm/examples/SIMD/DCT/Makefile	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,4 ----
+ NAME= dct
+ 
+ include ../Makefile.common
+ 
Index: llvm/examples/SIMD/DCT/dct.altivec.handwritten.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.altivec.handwritten.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:17 2005
--- llvm/examples/SIMD/DCT/dct.altivec.handwritten.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,198 ----
+ static inline void Matrix_Transpose ( vector signed short *input, vector signed short *output)
+ {
+   vector signed short a0, a1, a2, a3, a4, a5, a6, a7;
+   vector signed short b0, b1, b2, b3, b4, b5, b6, b7;
+ 
+   b0 = vec_mergeh( input[0], input[4] );   /* [ 00 40 01 41 02 42 03 43 ]*/
+   b1 = vec_mergel( input[0], input[4] );   /* [ 04 44 05 45 06 46 07 47 ]*/
+   b2 = vec_mergeh( input[1], input[5] );   /* [ 10 50 11 51 12 52 13 53 ]*/
+   b3 = vec_mergel( input[1], input[5] );   /* [ 14 54 15 55 16 56 17 57 ]*/
+   b4 = vec_mergeh( input[2], input[6] );   /* [ 20 60 21 61 22 62 23 63 ]*/
+   b5 = vec_mergel( input[2], input[6] );   /* [ 24 64 25 65 26 66 27 67 ]*/
+   b6 = vec_mergeh( input[3], input[7] );   /* [ 30 70 31 71 32 72 33 73 ]*/
+   b7 = vec_mergel( input[3], input[7] );   /* [ 34 74 35 75 36 76 37 77 ]*/
+ 
+   a0 = vec_mergeh( b0, b4 );               /* [ 00 20 40 60 01 21 41 61 ]*/
+   a1 = vec_mergel( b0, b4 );               /* [ 02 22 42 62 03 23 43 63 ]*/
+   a2 = vec_mergeh( b1, b5 );               /* [ 04 24 44 64 05 25 45 65 ]*/
+   a3 = vec_mergel( b1, b5 );               /* [ 06 26 46 66 07 27 47 67 ]*/
+   a4 = vec_mergeh( b2, b6 );               /* [ 10 30 50 70 11 31 51 71 ]*/
+   a5 = vec_mergel( b2, b6 );               /* [ 12 32 52 72 13 33 53 73 ]*/
+   a6 = vec_mergeh( b3, b7 );               /* [ 14 34 54 74 15 35 55 75 ]*/
+   a7 = vec_mergel( b3, b7 );               /* [ 16 36 56 76 17 37 57 77 ]*/
+ 
+   output[0] = vec_mergeh( a0, a4 );        /* [ 00 10 20 30 40 50 60 70 ]*/
+   output[1] = vec_mergel( a0, a4 );        /* [ 01 11 21 31 41 51 61 71 ]*/
+   output[2] = vec_mergeh( a1, a5 );        /* [ 02 12 22 32 42 52 62 72 ]*/
+   output[3] = vec_mergel( a1, a5 );        /* [ 03 13 23 33 43 53 63 73 ]*/
+   output[4] = vec_mergeh( a2, a6 );        /* [ 04 14 24 34 44 54 64 74 ]*/
+   output[5] = vec_mergel( a2, a6 );        /* [ 05 15 25 35 45 55 65 75 ]*/
+   output[6] = vec_mergeh( a3, a7 );        /* [ 06 16 26 36 46 56 66 76 ]*/
+   output[7] = vec_mergel( a3, a7 );        /* [ 07 17 27 37 47 57 67 77 ]*/
+ 
+ }
+ 
+ /***************************************************************
+  *
+  * Copyright:   (c) Copyright Motorola Inc. 1998
+  *
+  * Date:        April 15, 1998
+  *
+  * Macro:       DCT_Transform
+  *
+  * Description: Discrete Cosine Transform implemented by the
+  *              Scaled Chen (II) Algorithm developed by Haifa
+  *              Research Lab.  The major difference between this
+  *              algorithm and the Scaled Chen (I) is that
+  *              certain multiply-subtracts are replaced by
+  *              multiply adds.  A full description of the
+  *              Scaled Chen (I) algorithm can be found in:
+  *              W.C.Chen, C.H.Smith and S.C.Fralick, "A Fast
+  *              Computational Algorithm for the Discrete Cosine
+  *              Transform", IEEE Transactions on Communications,
+  *              Vol. COM-25, No. 9, pp 1004-1009, Sept. 1977.
+  *
+  * Inputs:      vx     : array of vector short
+  *              t0-t10 : temporary vector variables set up by caller
+  *              c4     : cos(4*pi/16)
+  *              mc4    : -c4
+  *              a0     : c6/c2
+  *              a1     : c7/c1
+  *              a2     : c5/c3
+  *              ma2    : -a2
+  *              zero   : an array of zero elements
+  *
+  * Outputs:     vy     : array of vector short
+  *
+  **************************************************************/
+ 
+ #define DCT_Transform(vx,vy) \
+   \
+   /* 1st stage. */ \
+   t8 = vec_adds( vx[0], vx[7] );     /* t0 + t7 */ \
+   t9 = vec_subs( vx[0], vx[7] );     /* t0 - t7 */ \
+   t0 = vec_adds( vx[1], vx[6] );     /* t1 + t6 */ \
+   t7 = vec_subs( vx[1], vx[6] );     /* t1 - t6 */ \
+   t1 = vec_adds( vx[2], vx[5] );     /* t2 + t5 */ \
+   t6 = vec_subs( vx[2], vx[5] );     /* t2 - t5 */ \
+   t2 = vec_adds( vx[3], vx[4] );     /* t3 + t4 */ \
+   t5 = vec_subs( vx[3], vx[4] );     /* t3 - t4 */ \
+   \
+   /* 2nd stage. */ \
+   t3 = vec_adds( t8, t2 );           /* (t0+t7) + (t3+t4) */ \
+   t4 = vec_subs( t8, t2 );           /* (t0+t7) - (t3+t4) */ \
+   t2 = vec_adds( t0, t1 );           /* (t1+t6) + (t2+t5) */ \
+   t8 = vec_subs( t0, t1 );           /* (t1+t6) - (t2+t5) */ \
+   \
+   t1 = vec_adds( t7, t6 );           /* (t1-t6) + (t2-t5) */ \
+   t0 = vec_subs( t7, t6 );           /* (t1-t6) - (t2-t5) */ \
+   \
+   /* 3rd stage */ \
+   vy[0] = vec_adds( t3, t2 );        /* y0 = t3 + t2 */ \
+   vy[4] = vec_subs( t3, t2 );        /* y4 = t3 - t2 */ \
+   vy[2] = vec_mradds( t8, a0, t4 );  /* y2 = t8 * (a0) + t4 */ \
+   t10 = vec_mradds( t4, a0, zero );  \
+   vy[6] = vec_subs( t10, t8 );       /* y6 = t4 * (a0) - t8 */ \
+   \
+   t6 = vec_mradds( t0, c4, t5 );     /* t6 = t0 * (c4) + t5 */ \
+   t7 = vec_mradds( t0, mc4, t5 );    /* t7 = t0 * (-c4) + t5 */ \
+   t2 = vec_mradds( t1, mc4, t9 );    /* t2 = t1 * (-c4) + t9 */ \
+   t3 = vec_mradds( t1, c4, t9 );     /* t3 = t1 * (c4) + t9 */ \
+   \
+   /* 4th stage. */ \
+   vy[1] = vec_mradds( t6, a1, t3 );  /* y1 = t6 * (a1) + t3 */ \
+   t9 = vec_mradds( t3, a1, zero );   \
+   vy[7] = vec_subs( t9, t6 );        /* y7 = t3 * (a1) - t6 */ \
+   vy[5] = vec_mradds( t2, a2, t7 );  /* y5 = t2 * (a2) + t7 */ \
+   vy[3] = vec_mradds( t7, ma2, t2 ); /* y3 = t7 * (-a2) + t2 */
+ 
+ /* Post-scaling matrix -- scaled by 1 */
+ vector signed short PostScale[8] = {
+   (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+   (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 ),
+   (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+   (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+   (vector signed short)( 4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681 ),
+   (vector signed short)( 4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680 ),
+   (vector signed short)( 5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422 ),
+   (vector signed short)( 5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880 )
+ };
+ 
+ /***************************************************************
+  *
+  * Copyright:    (c) Copyright Motorola Inc. 1998
+  *
+  * Date:         April 17, 1998
+  *
+  * Function:     DCT
+  *
+  * Description:  Scaled Chen (II) algorithm for DCT
+  *               Arithmetic is 16-bit fixed point.
+  *
+  * Inputs:       input - Pointer to input data (short), which
+  *               must be between -255 and +255.
+  *               It is assumed that the allocated array
+  *               has been 128-bit aligned and contains
+  *               8x8 short elements.
+  *
+  * Outputs:      output - Pointer to output area for the transformed
+  *               data.  The output values are between -2040
+  *               and 2040.  It is assumed that a 128-bit
+  *               aligned 8x8 array of short has been
+  *               pre-allocated.
+  *
+  * Return:       None
+  *
+  ***************************************************************/
+ 
+ void dct_vector(short *input, short *output) {
+ 
+   vector signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+   vector signed short a0, a1, a2, ma2, c4, mc4, zero;
+   vector signed short vx[8], vy[8];
+   vector signed short *vec_ptr;   /* used for conversion between
+                                      arrays of short and vector
+                                      signed short array. */
+ 
+   /* load the multiplication constants */
+   c4   = (vector signed short)(23170);    /* c4 = cos(4*pi/16) */
+   a0   = (vector signed short)(13573);    /* a0 = c6/c2        */
+   a1   = (vector signed short)(6518);     /* a1 = c7/c1        */
+   a2   = (vector signed short)(21895);    /* a2 = c5/c3        */
+   mc4  = (vector signed short)(-23170);   /* -c4               */
+   ma2  = (vector signed short)(-21895);   /* -a2               */
+   zero = (vector signed short)(0);        /* 0                 */
+ 
+   /* copy the rows of input data */
+   vec_ptr = ( vector signed short * ) input;
+   vx[0] = vec_ptr[0];
+   vx[1] = vec_ptr[1];
+   vx[2] = vec_ptr[2];
+   vx[3] = vec_ptr[3];
+   vx[4] = vec_ptr[4];
+   vx[5] = vec_ptr[5];
+   vx[6] = vec_ptr[6];
+   vx[7] = vec_ptr[7];
+ 
+   /* Perform DCT first on the 8 columns */
+   DCT_Transform( vx, vy );
+ 
+   /* Transpose matrix to work on rows */
+   Matrix_Transpose( vy, vx );
+ 
+   /* Then perform DCT on the 8 rows */
+   DCT_Transform( vx, vy );
+ 
+   /* Post-scale and store result. */
+   vec_ptr = (vector signed short *) output;
+ 
+   vec_ptr[0] = vec_mradds( PostScale[0], vy[0], zero );
+   vec_ptr[1] = vec_mradds( PostScale[1], vy[1], zero );
+   vec_ptr[2] = vec_mradds( PostScale[2], vy[2], zero );
+   vec_ptr[3] = vec_mradds( PostScale[3], vy[3], zero );
+   vec_ptr[4] = vec_mradds( PostScale[4], vy[4], zero );
+   vec_ptr[5] = vec_mradds( PostScale[5], vy[5], zero );
+   vec_ptr[6] = vec_mradds( PostScale[6], vy[6], zero );
+   vec_ptr[7] = vec_mradds( PostScale[7], vy[7], zero );
+ 
+ }
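All three vector versions in this commit are built around one fixed-point primitive: AltiVec's vec_mradds, which multiplies two vectors of 16-bit elements, rounds, shifts right by 15, adds a third vector, and saturates. That is also why the DCT constants are cosine ratios scaled by 2^15 (23170 ~= cos(4*pi/16)*32768, 13573 ~= (c6/c2)*32768, 6518 ~= (c7/c1)*32768, 21895 ~= (c5/c3)*32768). The following scalar model of one 16-bit lane is for reference only; the helper name is illustrative and not part of the committed code:

#include <stdint.h>

/* Reference model of one 16-bit lane of vec_mradds(a, b, c):
   multiply, round, shift right by 15, add c, then saturate. */
int16_t mradds_lane(int16_t a, int16_t b, int16_t c)
{
    int32_t prod = ((int32_t)a * (int32_t)b + 0x4000) >> 15;  /* rounded Q15 product */
    int32_t sum  = prod + (int32_t)c;
    if (sum >  32767) sum =  32767;   /* saturate to the 16-bit range */
    if (sum < -32768) sum = -32768;
    return (int16_t)sum;
}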
Index: llvm/examples/SIMD/DCT/dct.sse.handwritten.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.sse.handwritten.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/dct.sse.handwritten.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,129 ----
+ #include "Scalar.h"
+ #include "SSE.h"
+ 
+ extern short *PostScalePtr;
+ 
+ static inline void Matrix_Transpose ( short *input_scalar, short *output_scalar)
+ {
+   __m128i *input = (__m128i*) input_scalar;
+   __m128i *output = (__m128i*) output_scalar;
+ 
+   __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+   __m128i b0, b1, b2, b3, b4, b5, b6, b7;
+ 
+   b0 = _mm_unpacklo_epi16( input[0], input[4] );   /* [ 00 40 01 41 02 42 03 43 ]*/
+   b1 = _mm_unpackhi_epi16( input[0], input[4] );   /* [ 04 44 05 45 06 46 07 47 ]*/
+   b2 = _mm_unpacklo_epi16( input[1], input[5] );   /* [ 10 50 11 51 12 52 13 53 ]*/
+   b3 = _mm_unpackhi_epi16( input[1], input[5] );   /* [ 14 54 15 55 16 56 17 57 ]*/
+   b4 = _mm_unpacklo_epi16( input[2], input[6] );   /* [ 20 60 21 61 22 62 23 63 ]*/
+   b5 = _mm_unpackhi_epi16( input[2], input[6] );   /* [ 24 64 25 65 26 66 27 67 ]*/
+   b6 = _mm_unpacklo_epi16( input[3], input[7] );   /* [ 30 70 31 71 32 72 33 73 ]*/
+   b7 = _mm_unpackhi_epi16( input[3], input[7] );   /* [ 34 74 35 75 36 76 37 77 ]*/
+ 
+   a0 = _mm_unpacklo_epi16( b0, b4 );               /* [ 00 20 40 60 01 21 41 61 ]*/
+   a1 = _mm_unpackhi_epi16( b0, b4 );               /* [ 02 22 42 62 03 23 43 63 ]*/
+   a2 = _mm_unpacklo_epi16( b1, b5 );               /* [ 04 24 44 64 05 25 45 65 ]*/
+   a3 = _mm_unpackhi_epi16( b1, b5 );               /* [ 06 26 46 66 07 27 47 67 ]*/
+   a4 = _mm_unpacklo_epi16( b2, b6 );               /* [ 10 30 50 70 11 31 51 71 ]*/
+   a5 = _mm_unpackhi_epi16( b2, b6 );               /* [ 12 32 52 72 13 33 53 73 ]*/
+   a6 = _mm_unpacklo_epi16( b3, b7 );               /* [ 14 34 54 74 15 35 55 75 ]*/
+   a7 = _mm_unpackhi_epi16( b3, b7 );               /* [ 16 36 56 76 17 37 57 77 ]*/
+ 
+   output[0] = _mm_unpacklo_epi16( a0, a4 );        /* [ 00 10 20 30 40 50 60 70 ]*/
+   output[1] = _mm_unpackhi_epi16( a0, a4 );        /* [ 01 11 21 31 41 51 61 71 ]*/
+   output[2] = _mm_unpacklo_epi16( a1, a5 );        /* [ 02 12 22 32 42 52 62 72 ]*/
+   output[3] = _mm_unpackhi_epi16( a1, a5 );        /* [ 03 13 23 33 43 53 63 73 ]*/
+   output[4] = _mm_unpacklo_epi16( a2, a6 );        /* [ 04 14 24 34 44 54 64 74 ]*/
+   output[5] = _mm_unpackhi_epi16( a2, a6 );        /* [ 05 15 25 35 45 55 65 75 ]*/
+   output[6] = _mm_unpacklo_epi16( a3, a7 );        /* [ 06 16 26 36 46 56 66 76 ]*/
+   output[7] = _mm_unpackhi_epi16( a3, a7 );        /* [ 07 17 27 37 47 57 67 77 ]*/
+ 
+ }
+ 
+ static inline void DCT_Transform ( short *x, short *y) {
+   __m128i *vx = (__m128i*) x;
+   __m128i *vy = (__m128i*) y;
+ 
+   __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ 
+   __m128i c13573 = _mm_splat_epi16(13573);
+   __m128i c21895 = _mm_splat_epi16(21895);
+   __m128i cNeg21895 = _mm_splat_epi16(-21895);
+   __m128i c23170 = _mm_splat_epi16(23170);
+   __m128i cNeg23170 = _mm_splat_epi16(-23170);
+   __m128i c6518 = _mm_splat_epi16(6518);
+ 
+   t8 = _mm_adds_epi16(vx[0], vx[7]);
+   t9 = _mm_subs_epi16(vx[0], vx[7]);
+   t0 = _mm_adds_epi16(vx[1], vx[6]);
+   t7 = _mm_subs_epi16(vx[1], vx[6]);
+   t1 = _mm_adds_epi16(vx[2], vx[5]);
+   t6 = _mm_subs_epi16(vx[2], vx[5]);
+   t2 = _mm_adds_epi16(vx[3], vx[4]);
+   t5 = _mm_subs_epi16(vx[3], vx[4]);
+ 
+   t3 = _mm_adds_epi16(t8, t2);
+   t4 = _mm_subs_epi16(t8, t2);
+   t2 = _mm_adds_epi16(t0, t1);
+   t8 = _mm_subs_epi16(t0, t1);
+ 
+   t1 = _mm_adds_epi16(t7, t6);
+   t0 = _mm_subs_epi16(t7, t6);
+ 
+   vy[0] = _mm_adds_epi16(t3, t2);
+   vy[4] = _mm_subs_epi16(t3, t2);
+ 
+   vy[2] = _mm_mradds_epi16(t8, c13573, t4);
+   t10 = _mm_mr_epi16(t4, c13573);
+ 
+   vy[6] = _mm_subs_epi16(t10, t8);
+ 
+   t6 = _mm_mradds_epi16(t0, c23170, t5);
+   t7 = _mm_mradds_epi16(t0, cNeg23170, t5);
+   t2 = _mm_mradds_epi16(t1, cNeg23170, t9);
+   t3 = _mm_mradds_epi16(t1, c23170, t9);
+ 
+   vy[1] = _mm_mradds_epi16(t6, c6518, t3);
+   t9 = _mm_mr_epi16(t3, c6518);
+ 
+   vy[7] = _mm_subs_epi16(t9, t6);
+   vy[5] = _mm_mradds_epi16(t2, c21895, t7);
+   vy[3] = _mm_mradds_epi16(t7, cNeg21895, t2);
+ 
+ }
+ 
+ #define STORE(i) \
+   outputv[i] = _mm_mradds_epi16(PostScalev[i], yv[i], _mm_splat_epi16(0));
+ 
+ void dct_vector(short *input, short *output, short *x, short *y) {
+ 
+   __m128i *xv = (__m128i*) x;
+   __m128i *yv = (__m128i*) y;
+   __m128i *inputv = (__m128i*) input;
+   __m128i *outputv = (__m128i*) output;
+   __m128i *PostScalev = (__m128i*) PostScalePtr;
+ 
+   xv[0] = inputv[0];
+   xv[1] = inputv[1];
+   xv[2] = inputv[2];
+   xv[3] = inputv[3];
+   xv[4] = inputv[4];
+   xv[5] = inputv[5];
+   xv[6] = inputv[6];
+   xv[7] = inputv[7];
+ 
+   DCT_Transform( x, y );
+   Matrix_Transpose( y, x );
+   DCT_Transform( x, y );
+ 
+   STORE(0);
+   STORE(1);
+   STORE(2);
+   STORE(3);
+   STORE(4);
+   STORE(5);
+   STORE(6);
+   STORE(7);
+ 
+ }
+ 
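dct.sse.handwritten.c relies on _mm_splat_epi16, _mm_mr_epi16, and _mm_mradds_epi16 from the local SSE.h header, which is not part of this commit and is not a standard intrinsic set. A plausible sketch of what such wrappers could look like -- an assumption about SSE.h, written against the SSSE3 _mm_mulhrs_epi16 instruction, whose rounding matches vec_mradds -- is:

#include <tmmintrin.h>   /* SSSE3: _mm_mulhrs_epi16 (pulls in the SSE2 headers) */

/* Broadcast one 16-bit constant across all eight lanes. */
#define _mm_splat_epi16(x)         _mm_set1_epi16(x)

/* Multiply-round: ((a*b + 0x4000) >> 15) in each lane. */
#define _mm_mr_epi16(a, b)         _mm_mulhrs_epi16((a), (b))

/* Multiply-round-add with 16-bit saturation, mirroring vec_mradds. */
#define _mm_mradds_epi16(a, b, c)  _mm_adds_epi16(_mm_mulhrs_epi16((a), (b)), (c))

On hardware without SSSE3 the rounded multiply would have to be synthesized from _mm_mullo_epi16/_mm_mulhi_epi16, so the committed SSE.h presumably does something along those lines instead.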
Index: llvm/examples/SIMD/DCT/dct.vectorc.c
diff -c /dev/null llvm/examples/SIMD/DCT/dct.vectorc.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/dct.vectorc.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,140 ----
+ #include "Scalar.h"
+ #include "VectorC.h"
+ #include "Intrinsics.h"
+ 
+ // See the rgb2yuv benchmark for a description of USE_C0.  For some
+ // reason, USE_C0 1 seems to be slightly *faster* on SSE!  I'm
+ // investigating why this is.
+ //
+ #define USE_C0 1
+ 
+ short vllvm_adds_short(short,short);
+ 
+ #define MERGE(out01, out0, out1, in0, in1) \
+   short out01 = vllvm_fixed_vimm_short(0, 16); \
+   out01 = _fixed_combine_short(out01, 16, in0, 8, 0, 1); \
+   out01 = _fixed_combine_short(out01, 16, in1, 8, 8, 1); \
+   short out0 = _extract_short(out01, 0, 2, 8); \
+   short out1 = _extract_short(out01, 1, 2, 8)
+ 
+ #define IN(x) \
+   vllvm_load_short(input_scalar, 8, x)
+ 
+ #define STORE(out, idx) \
+   vllvm_store_short(out, output_scalar, idx)
+ 
+ static inline void Matrix_Transpose_VectorC (short *input_scalar, short *output_scalar) {
+   MERGE(b01, b0, b1, IN(0), IN(4));
+   MERGE(b23, b2, b3, IN(1), IN(5));
+   MERGE(b45, b4, b5, IN(2), IN(6));
+   MERGE(b67, b6, b7, IN(3), IN(7));
+ 
+   MERGE(a01, a0, a1, b0, b4);
+   MERGE(a23, a2, a3, b1, b5);
+   MERGE(a45, a4, a5, b2, b6);
+   MERGE(a67, a6, a7, b3, b7);
+ 
+   MERGE(out01, out0, out1, a0, a4);
+   MERGE(out23, out2, out3, a1, a5);
+   MERGE(out45, out4, out5, a2, a6);
+   MERGE(out67, out6, out7, a3, a7);
+ 
+   STORE(out0, 0);
+   STORE(out1, 1);
+   STORE(out2, 2);
+   STORE(out3, 3);
+   STORE(out4, 4);
+   STORE(out5, 5);
+   STORE(out6, 6);
+   STORE(out7, 7);
+ }
+ 
+ static inline void DCT_Transform_VectorC ( short *x, short *y) {
+   signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ 
+   t8 = vllvm_adds_short(vllvm_load_short(x, 8, 0), vllvm_load_short(x, 8, 7));
+   t9 = vllvm_subs_short(vllvm_load_short(x, 8, 0), vllvm_load_short(x, 8, 7));
+   t0 = vllvm_adds_short(vllvm_load_short(x, 8, 1), vllvm_load_short(x, 8, 6));
+   t7 = vllvm_subs_short(vllvm_load_short(x, 8, 1), vllvm_load_short(x, 8, 6));
+   t1 = vllvm_adds_short(vllvm_load_short(x, 8, 2), vllvm_load_short(x, 8, 5));
+   t6 = vllvm_subs_short(vllvm_load_short(x, 8, 2), vllvm_load_short(x, 8, 5));
+   t2 = vllvm_adds_short(vllvm_load_short(x, 8, 3), vllvm_load_short(x, 8, 4));
+   t5 = vllvm_subs_short(vllvm_load_short(x, 8, 3), vllvm_load_short(x, 8, 4));
+ 
+   t3 = vllvm_adds_short(t8, t2);
+   t4 = vllvm_subs_short(t8, t2);
+   t2 = vllvm_adds_short(t0, t1);
+   t8 = vllvm_subs_short(t0, t1);
+ 
+   t1 = vllvm_adds_short(t7, t6);
+   t0 = vllvm_subs_short(t7, t6);
+ 
+   vllvm_store_short(vllvm_adds_short(t3, t2), y, 0);
+   vllvm_store_short(vllvm_subs_short(t3, t2), y, 4);
+ 
+   short c13573 = vllvm_fixed_vimm_short(13573, 8);
+ #if USE_C0
+   short c0 = vllvm_fixed_vimm_short(0, 8);
+ #endif
+   short c23170 = vllvm_fixed_vimm_short(23170, 8);
+   short cneg23170 = vllvm_fixed_vimm_short(-23170, 8);
+   short c6518 = vllvm_fixed_vimm_short(6518, 8);
+ 
+   vllvm_store_short(vllvm_mradds_short(t8, c13573, t4), y, 2);
+ #if USE_C0
+   t10 = vllvm_mradds_short(t4, c13573, c0);
+ #else
+   t10 = vllvm_mr_short(t4, c13573);
+ #endif
+   vllvm_store_short(vllvm_subs_short(t10, t8), y, 6);
+ 
+   t6 = vllvm_mradds_short(t0, c23170, t5);
+   t7 = vllvm_mradds_short(t0, cneg23170, t5);
+   t2 = vllvm_mradds_short(t1, cneg23170, t9);
+   t3 = vllvm_mradds_short(t1, c23170, t9);
+ 
+   vllvm_store_short(vllvm_mradds_short(t6, c6518, t3), y, 1);
+ #if USE_C0
+   t9 = vllvm_mradds_short(t3, c6518, c0);
+ #else
+   t9 = vllvm_mr_short(t3, c6518);
+ #endif
+   vllvm_store_short(vllvm_subs_short(t9, t6), y, 7);
+   vllvm_store_short(vllvm_mradds_short(t2, vllvm_fixed_vimm_short(21895, 8), t7), y, 5);
+   vllvm_store_short(vllvm_mradds_short(t7, vllvm_fixed_vimm_short(-21895, 8), t2), y, 3);
+ 
+ }
+ 
+ extern short *PostScalePtr;
+ 
+ #define STORE2(i) \
+   vllvm_store_short(vllvm_mradds_short(vllvm_load_short(PostScalePtr, 8, i), \
+                                        vllvm_load_short(y, 8, i), \
+                                        vllvm_fixed_vimm_short(0, 8)), \
+                     output, i);
+ 
+ void dct_vector(short *input, short *output, short *x, short *y) {
+ 
+   vllvm_store_short(vllvm_load_short(input, 8, 0), x, 0);
+   vllvm_store_short(vllvm_load_short(input, 8, 1), x, 1);
+   vllvm_store_short(vllvm_load_short(input, 8, 2), x, 2);
+   vllvm_store_short(vllvm_load_short(input, 8, 3), x, 3);
+   vllvm_store_short(vllvm_load_short(input, 8, 4), x, 4);
+   vllvm_store_short(vllvm_load_short(input, 8, 5), x, 5);
+   vllvm_store_short(vllvm_load_short(input, 8, 6), x, 6);
+   vllvm_store_short(vllvm_load_short(input, 8, 7), x, 7);
+   DCT_Transform_VectorC( x, y );
+   Matrix_Transpose_VectorC( y, x );
+   DCT_Transform_VectorC( x, y );
+ 
+   STORE2(0);
+   STORE2(1);
+   STORE2(2);
+   STORE2(3);
+   STORE2(4);
+   STORE2(5);
+   STORE2(6);
+   STORE2(7);
+ 
+ }
+ 
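The USE_C0 switch above only changes how the bare multiply-round is spelled: with USE_C0 set, vllvm_mr_short(a, b) is replaced by vllvm_mradds_short(a, b, c0) with an explicit all-zero vector, which should yield the same rounded product because the addend is zero. In terms of the scalar lane model shown after the AltiVec version (again illustrative, not the vllvm API itself):

#include <stdint.h>

/* Bare multiply-round, i.e. mradds with a zero addend: for the constants
   used in these kernels, mr_lane(a, b) == mradds_lane(a, b, 0). */
int16_t mr_lane(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * (int32_t)b + 0x4000) >> 15);
}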
Index: llvm/examples/SIMD/DCT/main.c
diff -c /dev/null llvm/examples/SIMD/DCT/main.c:1.1.2.1
*** /dev/null	Sun Oct 23 17:50:18 2005
--- llvm/examples/SIMD/DCT/main.c	Sun Oct 23 17:49:39 2005
***************
*** 0 ****
--- 1,175 ----
+ #define N 1024
+ 
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <sys/time.h>
+ #include <sys/times.h>
+ #include "../_malloc.h"
+ #include "Scalar.h"
+ 
+ inline void dct_scalar(short*, short*);
+ void dct_vector(short*, short*, short*, short*);
+ 
+ short *in;
+ short *out_vector;
+ short *out_scalar;
+ 
+ static short PostScaleArray[64] = {
+   4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+   5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880,
+   5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+   4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+   4095, 5681, 5351, 4816, 4095, 4816, 5351, 5681,
+   4816, 6680, 6292, 5663, 4816, 5663, 6292, 6680,
+   5351, 7422, 6992, 6292, 5351, 6292, 6992, 7422,
+   5681, 7880, 7422, 6680, 5681, 6680, 7422, 7880
+ };
+ 
+ short *PostScalePtr;
+ 
+ void init() {
+   int i;
+ 
+   // Force 16-byte alignment
+   //
+   in = (short*) _malloc(N*sizeof(short));
+   out_vector = (short*) _malloc(N*sizeof(short));
+   out_scalar = (short*) _malloc(N*sizeof(short));
+   PostScalePtr = (short*) _malloc(64*sizeof(short));
+   memcpy(PostScalePtr, PostScaleArray, 64*sizeof(short));
+ 
+   // Populate in with a range of values
+   //
+   for (i = 0; i < N; ++i) {
+     in[i] = N/2-i;
+   }
+ 
+ }
+ 
+ float run() {
+   long t0, t1, t2;
+   int i,j;
+   struct tms buf_s, buf_e;
+   long scalar_time = 0, vector_time = 0;
+ 
+   times(&buf_s);
+   for (i = 0; i < 100000; ++i)
+     for (j = 0; j < N; j += 64)
+       dct_scalar(in+j, out_scalar+j);
+   times(&buf_e);
+   scalar_time = buf_e.tms_utime - buf_s.tms_utime;
+   printf("scalar time=%ld, ", scalar_time);
+ 
+   short *x = (short*) _malloc(64*sizeof(short));
+   short *y = (short*) _malloc(64*sizeof(short));
+   times(&buf_s);
+   for (i = 0; i < 100000; ++i)
+     for (j = 0; j < N; j += 64)
+       dct_vector(in+j, out_vector+j, x, y);
+   times(&buf_e);
+   vector_time = buf_e.tms_utime - buf_s.tms_utime;
+   printf("vector time=%ld, ", vector_time);
+ 
+   float speedup = ((float) scalar_time)/vector_time;
+   printf("speedup=%f\n", speedup);
+ 
+   for (i = 0; i < N; i++) {
+     if (out_vector[i] != out_scalar[i]) {
+       printf("FAILED\n");
+       exit(1);
+     }
+   }
+ 
+   return speedup;
+ }
+ 
+ int main (void) {
+   unsigned i;
+ 
+   init();
+ 
+   float best = 0;
+   for (i = 0; i < NRUNS; ++i) {
+     float speedup = run();
+     if (speedup > best)
+       best = speedup;
+   }
+   printf("best speedup=%f\n", best);
+ 
+   printf ("PASSED\n");
+   return 0;
+ 
+ }
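run() reports raw times() ticks (tms_utime), so the printed figures are in clock ticks rather than seconds; the speedup ratio is unaffected. NRUNS is not defined in main.c, so it presumably comes from the shared build setup in ../Makefile.common. If wall-clock numbers are wanted, the tick counts can be converted with sysconf(_SC_CLK_TCK), for example with a helper like this (illustrative, not part of the commit):

#include <unistd.h>
#include <sys/times.h>

/* Convert a tms_utime delta (clock ticks) into seconds. */
double ticks_to_seconds(clock_t ticks)
{
    long ticks_per_second = sysconf(_SC_CLK_TCK);
    return (double)ticks / (double)ticks_per_second;
}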
+ 
+ static inline void Matrix_Transpose ( short *input, short *output) {
+   unsigned i;
+   for (i = 0; i < 8; ++i) {
+     output[i]    = input[8*i];
+     output[8+i]  = input[8*i+1];
+     output[16+i] = input[8*i+2];
+     output[24+i] = input[8*i+3];
+     output[32+i] = input[8*i+4];
+     output[40+i] = input[8*i+5];
+     output[48+i] = input[8*i+6];
+     output[56+i] = input[8*i+7];
+   }
+ }
+ 
+ static inline void DCT_Transform ( short *x, short *y) {
+   signed short t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
+ 
+   unsigned i;
+   for (i = 0; i < 8; ++i) {
+     t8 = adds_short(x[i], x[56+i]);
+     t9 = subs_short(x[i], x[56+i]);
+     t0 = adds_short(x[8+i], x[48+i]);
+     t7 = subs_short(x[8+i], x[48+i]);
+     t1 = adds_short(x[16+i], x[40+i]);
+     t6 = subs_short(x[16+i], x[40+i]);
+     t2 = adds_short(x[24+i], x[32+i]);
+     t5 = subs_short(x[24+i], x[32+i]);
+ 
+     t3 = adds_short(t8, t2);
+     t4 = subs_short(t8, t2);
+     t2 = adds_short(t0, t1);
+     t8 = subs_short(t0, t1);
+ 
+     t1 = adds_short(t7, t6);
+     t0 = subs_short(t7, t6);
+ 
+     y[i]    = adds_short(t3, t2);
+     y[32+i] = subs_short(t3, t2);
+     y[16+i] = mradds_short(t8, 13573, t4);
+     t10 = mradds_short(t4, 13573, 0);
+     y[48+i] = subs_short(t10, t8);
+ 
+     t6 = mradds_short(t0, 23170, t5);
+     t7 = mradds_short(t0, -23170, t5);
+     t2 = mradds_short(t1, -23170, t9);
+     t3 = mradds_short(t1, 23170, t9);
+ 
+     y[8+i]  = mradds_short(t6, 6518, t3);
+     t9 = mradds_short(t3, 6518, 0);
+     y[56+i] = subs_short(t9, t6);
+     y[40+i] = mradds_short(t2, 21895, t7);
+     y[24+i] = mradds_short(t7, -21895, t2);
+   }
+ 
+ }
+ 
+ void dct_scalar(short *input, short *output) {
+ 
+   short x[64], y[64];
+   unsigned i, j;
+ 
+   memcpy(x, input, 64*sizeof(short));
+   DCT_Transform( x, y );
+   Matrix_Transpose( y, x );
+   DCT_Transform( x, y );
+ 
+   for (i = 0; i < 8; ++i)
+     for (j = 0; j < 8; ++j)
+       output[8*i+j] = mradds_short(PostScaleArray[8*i+j], y[8*i+j], 0);
+ }
+ 
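The scalar reference, like the SSE and Vector C versions, uses adds_short, subs_short, and mradds_short from the shared Scalar.h header, which is not included in this commit. Judging from the AltiVec operations they mirror, they are presumably saturating 16-bit add, subtract, and multiply-round-add; a sketch under that assumption:

#include <stdint.h>

static short saturate16(int32_t v)
{
    if (v >  32767) return  32767;
    if (v < -32768) return -32768;
    return (short)v;
}

/* Saturating 16-bit add and subtract, mirroring vec_adds / vec_subs. */
short adds_short(short a, short b) { return saturate16((int32_t)a + b); }
short subs_short(short a, short b) { return saturate16((int32_t)a - b); }

/* Multiply, round, shift right by 15, add, saturate -- mirroring vec_mradds. */
short mradds_short(short a, short b, short c)
{
    int32_t prod = ((int32_t)a * (int32_t)b + 0x4000) >> 15;
    return saturate16(prod + (int32_t)c);
}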