Changes in directory llvm/examples/SIMD/RGB2YUV:
Makefile added (r1.1.2.1) main.c added (r1.1.2.1) rgb2yuv.altivec.handwritten.c added (r1.1.2.1) rgb2yuv.sse.handwritten.c added (r1.1.2.1) rgb2yuv.vectorc.c added (r1.1.2.1) --- Log message: Examples to illustrate Vector LLVM's SIMD support. --- Diffs of the changes: (+384 -0) Makefile | 4 main.c | 148 +++++++++++++++++++++++++++ rgb2yuv.altivec.handwritten.c | 1 rgb2yuv.sse.handwritten.c | 230 ++++++++++++++++++++++++++++++++++++++++++ rgb2yuv.vectorc.c | 1 5 files changed, 384 insertions Index: llvm/examples/SIMD/RGB2YUV/Makefile diff -c /dev/null llvm/examples/SIMD/RGB2YUV/Makefile:1.1.2.1 *** /dev/null Sun Oct 23 17:50:00 2005 --- llvm/examples/SIMD/RGB2YUV/Makefile Sun Oct 23 17:49:41 2005 *************** *** 0 **** --- 1,4 ---- + NAME= rgb2yuv + + include ../Makefile.common + Index: llvm/examples/SIMD/RGB2YUV/main.c diff -c /dev/null llvm/examples/SIMD/RGB2YUV/main.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:17 2005 --- llvm/examples/SIMD/RGB2YUV/main.c Sun Oct 23 17:49:41 2005 *************** *** 0 **** --- 1,148 ---- + #define N 4800 + + #include <stdio.h> + #include <stdlib.h> + #include <sys/time.h> + #include <sys/times.h> + #include <assert.h> + #include "../_malloc.h" + + void rgb2yuv_scalar(unsigned char*, int, unsigned char*); + void rgb2yuv_vector(unsigned char*, int, unsigned char*); + + char *in; + char *out; + char *ref; + + void init() { + int i; + + // Force 16-byte alignment + // + in = (char*) _malloc(N*sizeof(char)); + out = (char*) _malloc(N*sizeof(char)); + ref = (char*) _malloc(N*sizeof(char)); + + // Populate in with a range of values + // + for (i = 0; i < N; ++i) { + in[i] = -(N/2)+i; + out[i] = 1; + ref[i] = 2; + } + + } + + void run(long *scalar_time, long *vector_time) { + long t0, t1, t2; + int i,j; + + struct tms buf_s, buf_e; + + times(&buf_s); + for (i = 0; i < 100000; ++i) + rgb2yuv_scalar(in, N, ref); + times(&buf_e); + *scalar_time = buf_e.tms_utime - buf_s.tms_utime; + printf("scalar time=%d, ", *scalar_time); + + 
times(&buf_s); + for (i = 0; i < 100000; ++i) + rgb2yuv_vector(in, N, out); + times(&buf_e); + *vector_time = buf_e.tms_utime - buf_s.tms_utime; + printf("vector time=%d, ", *vector_time); + + for (i = 0; i < N; i++) { + if (out[i] != ref[i]) { + printf("FAILED\n"); + exit(1); + } + } + + float speedup = ((float) *scalar_time) / *vector_time; + printf("speedup=%f\n", speedup); + + } + + int + main (void) { + unsigned i; + init(); + + long best_scalar = -1, best_vector = -1; + long scalar, vector; + for (i = 0; i < NRUNS; ++i) { + run (&scalar, &vector); + if (best_scalar < 0 || best_scalar > scalar) + best_scalar = scalar; + if (best_vector < 0 || best_vector > vector) + best_vector = vector; + } + + printf("best scalar=%d, ", best_scalar); + printf("best vector=%d, ", best_vector); + printf("speedup=%f\n", ((float) best_scalar)/best_vector); + printf ("PASSED\n"); + return 0; + } + + inline short saturate(int a) { + if (a > 32767) + return 32767; + if (a < -32768) + return -32768; + return a; + } + + inline short mradds(short a, short b, short c) { + int aint = a, bint = b, cint = c; + assert(((aint*bint)+(1<<14))>>15 == (((short)((aint*bint)>>14))+1)>>1); + return saturate(((aint*bint+(1 << 14)) >> 15) + cint); + } + + inline short adds(short a, short b) { + return saturate(a+b); + } + + inline unsigned char saturate_uchar(unsigned short a) { + if (a > 255) + return 255; + return a; + } + + void rgb2yuv_scalar(unsigned char *RGB_char_ptr, int RGB_size, + unsigned char *YCC_char_ptr) { + short red, green, blue; + short Y, Cb, Cr; + unsigned j, i; + + for (i = 0; i < RGB_size; i += 3*16) { + for (j = 0; j < 16; ++j) { + red = RGB_char_ptr[i+3*j]; + green = RGB_char_ptr[i+3*j+1]; + blue = RGB_char_ptr[i+3*j+2]; + + Y = mradds(red, 8432, 0); + Cb = mradds(red, -4818, 0); + Cr = mradds(red, 14345, 0); + + Y = mradds(green, 16425, Y); + Cb = mradds(green, -9527, Cb); + Cr = mradds(green, -12045, Cr); + + Y = mradds(blue, 3176, Y); + Cb = mradds(blue, 14345, Cb); + Cr = 
mradds(blue, -2300, Cr); + + Y = adds(Y, 16); + Cb = adds(Cb, 128); + Cr = adds(Cr, 128); + + YCC_char_ptr[i+j] = saturate_uchar(Y); + YCC_char_ptr[i+j+16] = saturate_uchar(Cb); + YCC_char_ptr[i+j+32] = saturate_uchar(Cr); + } + } + } + Index: llvm/examples/SIMD/RGB2YUV/rgb2yuv.altivec.handwritten.c diff -c /dev/null llvm/examples/SIMD/RGB2YUV/rgb2yuv.altivec.handwritten.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:18 2005 --- llvm/examples/SIMD/RGB2YUV/rgb2yuv.altivec.handwritten.c Sun Oct 23 17:49:41 2005 *************** *** 0 **** --- 1 ---- + void rgb2yuv_vector(unsigned char *RGB_char_ptr, int RGB_size, unsigned char *YCC_char_ptr) { vector unsigned char *RGB_ptr = (vector unsigned char*) RGB_char_ptr; vector unsigned char *YCC_ptr = (vector unsigned char*) YCC_char_ptr; vector signed short r0, r1, r2, g0, g1, g2, b0, b1, b2, c0, c16, c128; vector unsigned char z0, tc0, tc1, tc2, tc3; vector signed short tr0, tr1, tg0, tg1, tb0, tb1, mask; vector signed short t0, t1, t2, t3, t4, t5; int i, j; vector unsigned char vPerm1 = (vector unsigned char)( 0, 3, 6, 9, 12, 15, 18, 21, /* R0..R7 */ 1, 4, 7, 10, 13, 16, 19, 22 /* G0..G7 */); vector unsigned char vPerm2 = (vector unsigned char)( 2, 5, 8, 11, 14, 17, 20, 23, /* B0..B7 */ 0, 0, 0, 0, 0, 0, 0, 0 /* dont care */); vector unsigned char vPerm3 = (vector unsigned char)( 8, 11, 14, 17, 20, 23, 26, 29, /* R8..R15 */ 9 , 12, 15, 18, 21, 24, 27, 30 /* G8..G15 */); vector unsigned char vPerm4 = (vector unsigned char)(10, 13, 16, 19, 22, 25, 28, 31, /* B8..B15 */ 0, 0, 0, 0, 0, 0, 0, 0 /* dont care */); vector signed short vConst1 = (vector signed short)( 8432, 16425, 3176, -4818, -9527, 14345, 0, 0 ); vector signed short vConst2 = (vector signed short)( 14345, -12045, -2300, 16, 128, 0, 0, 0 ); r0 = vec_splat( vConst1, 0 ); /* 8432 */ g0 = vec_splat( vConst1, 1 ); /* 16425 */ b0 = vec_splat( vConst1, 2 ); /* 3176 */ r1 = vec_splat( vConst1, 3 ); /* -4818 */ g1 = vec_splat( vConst1, 4 ); /* -9527 */ b1 = vec_splat( vConst1, 
5 ); /* 14345 */ r2 = vec_splat( vConst2, 0 ); /* 14345 */ g2 = vec_splat( vConst2, 1 ); /*-12045 */ b2 = vec_splat( vConst2, 2 ); /* -2300 */ c16 = vec_splat( vConst2, 3 ); /* 16 */ c128 = vec_splat( vConst2, 4 ); /* 128 */ c0 = (vector signed short) (0); /* 0 */ z0 = (vector unsigned char) (0); /* 0 */ mask = (vector signed short) (0x00FF); vector unsigned char Ys; vector unsigned char Cbs; vector unsigned char Crs; for ( i = 0; i < (RGB_size/sizeof(vector unsigned char)); i+=3 ) { tc0 = vec_perm( RGB_ptr[i], RGB_ptr[i+1], vPerm1 ); /* R0..R7 G0..G7 */ tc1 = vec_perm( RGB_ptr[i], RGB_ptr[i+1], vPerm2 ); /* B0..B7 */ tc2 = vec_perm( RGB_ptr[i+1], RGB_ptr[i+2], vPerm3 ); /* R8..R15 G8..G15 */ tc3 = vec_perm( RGB_ptr[i+1], RGB_ptr[i+2], vPerm4 ); /* B8..B15 */ tr0 = vec_and(vec_unpackh( (vector signed char) tc0 ), mask); /* tr0 = R0 .. R7 */ tg0 = vec_and(vec_unpackl( (vector signed char) tc0 ), mask); /* tg0 = G0 .. G7 */ tb0 = vec_and(vec_unpackh( (vector signed char) tc1 ), mask); /* tb0 = B0 .. B7 */ tr1 = vec_and(vec_unpackh( (vector signed char) tc2 ), mask); /* tr0 = R8 .. R15 */ tg1 = vec_and(vec_unpackl( (vector signed char) tc2 ), mask); /* tg0 = G8 .. G15 */ tb1 = vec_and(vec_unpackh( (vector signed char) tc3 ), mask); /* tb0 = B8 .. B15 */ t0 = vec_mradds( tr0, r0, c0 ); /* (R0 .. R7) * 8432 */ t1 = vec_mradds( tr0, r1, c0 ); /* (R0 .. R7) * -4818 */ t2 = vec_mradds( tr0, r2, c0 ); /* (R0 .. R7) * 14345 */ t0 = vec_mradds( tg0, g0, t0 ); /* += (G0 .. G7) * 16425 */ t1 = vec_mradds( tg0, g1, t1 ); /* += (G0 .. G7) * -9527 */ t2 = vec_mradds( tg0, g2, t2 ); /* += (G0 .. G7) * -12045 */ t0 = vec_mradds( tb0, b0, t0 ); /* += (B0 .. B7) * 3176 */ t1 = vec_mradds( tb0, b1, t1 ); /* += (B0 .. B7) * 14345 */ t2 = vec_mradds( tb0, b2, t2 ); /* += (B0 .. B7) * -2300 */ /* Convert the next three input vectors. */ t3 = vec_mradds( tr1, r0, c0 ); /* (R8 .. R15) * 8432 */ t4 = vec_mradds( tr1, r1, c0 ); /* (R8 ..
R15) * -4818 */ t5 = vec_mradds( tr1, r2, c0 ); /* (R8 .. R15) * 14345 */ t3 = vec_mradds( tg1, g0, t3 ); /* += (G8 .. G15) * 16425 */ t4 = vec_mradds( tg1, g1, t4 ); /* += (G8 .. G15) * -9527 */ t5 = vec_mradds( tg1, g2, t5 ); /* += (G8 .. G15) * -12045 */ t3 = vec_mradds( tb1, b0, t3 ); /* += (B8 .. B15) * 3176 */ t4 = vec_mradds( tb1, b1, t4 ); /* += (B8 .. B15) * 14345 */ t5 = vec_mradds( tb1, b2, t5 ); /* += (B8 .. B15) * -2300 */ t0 = vec_adds( t0, c16 ); t3 = vec_adds( t3, c16 ); t1 = vec_adds( t1, c128 ); t4 = vec_adds( t4, c128 ); t2 = vec_adds( t2, c128 ); t5 = vec_adds( t5, c128 ); YCC_ptr[i] = vec_packsu( t0, t3 ); /* Y0 .. Y15 */ YCC_ptr[i+1] = vec_packsu( t1, t4 ); /* Cb0 .. Cb15 */ YCC_ptr[i+2] = vec_packsu( t2, t5 ); /* Cr0 .. Cr15 */ } } \ No newline at end of file Index: llvm/examples/SIMD/RGB2YUV/rgb2yuv.sse.handwritten.c diff -c /dev/null llvm/examples/SIMD/RGB2YUV/rgb2yuv.sse.handwritten.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:18 2005 --- llvm/examples/SIMD/RGB2YUV/rgb2yuv.sse.handwritten.c Sun Oct 23 17:49:41 2005 *************** *** 0 **** --- 1,230 ---- + #include <emmintrin.h> + + #define VECTOR(x) *((__m128i*) &x) + #define CONSTANT(x) _mm_set_epi16(x,x,x,x,x,x,x,x) + + inline __m128i vec_mr(__m128i x, short y) { + __m128i const_1 = _mm_set_epi16(1,1,1,1,1,1,1,1); + __m128i y_vec = _mm_set_epi16(y, y, y, y,y, y, y, y); + __m128i tmp_hi = _mm_mulhi_epi16(x, y_vec); + __m128i tmp_lo = _mm_mullo_epi16(x, y_vec); + __m128i hi = _mm_slli_epi16(tmp_hi, 2); + __m128i lo = _mm_srli_epi16(tmp_lo, 14); + __m128i tmp_vec = _mm_or_si128(hi, lo); + tmp_vec = _mm_add_epi16(tmp_vec, const_1); + tmp_vec = _mm_srai_epi16(tmp_vec, 1); + return tmp_vec; + } + + inline __m128i vec_mradds(__m128i x, short y, __m128i z) { + return _mm_adds_epi16(vec_mr(x,y),z); + } + + #define MRADDS(x,y,z) _mm_adds_epi16(vec_mr(x,y),z) + + void print_quaternary(unsigned char ch) { + unsigned i; + for (i = 0; i < 4; ++i) + printf("%d ", (ch >> (2*i)) & 3); + printf("\n"); + } 
+ + void print_vector_128(__m128i vec) { + __m128i tmp = vec; + unsigned char *p = (unsigned char*) &tmp; + unsigned i; + for (i = 0; i < 16; ++i) + printf("%02X ", p[i]); + printf("\n"); + } + + #define idx(idx0, idx1, idx2, idx3) \ + idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6) + + #define extract(source, idx0, idx1, idx2, idx3) \ + _mm_shuffle_epi32(source, idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6)) \ + + #define mask(source, idx0, idx1, idx2, idx3) \ + _mm_and_si128(source, _mm_set_epi32(idx3 * ~0U, idx2 * ~0U, idx1 * ~0U, idx0 * ~0U)) + + #define msk(idx0, idx1, idx2, idx3) \ + _mm_set_epi32(idx3 * ~0U, idx2 * ~0U, idx1 * ~0U, idx0 * ~0U) + + void rgb2yuv_vector(unsigned char *RGB_char_ptr, int RGB_size, + unsigned char *YCC_char_ptr) { + + __m128i* RGB_ptr = (__m128i*) RGB_char_ptr; + __m128i zero = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0); + __m128i constant_16 = CONSTANT(16); + __m128i constant_128 = CONSTANT(128); + + unsigned j, i; + + __m128i in0123, in01, in0, in1, in23, in2, in3; + __m128i in4567, in45, in4, in5, in67, in6, in7; + __m128i in89AB, in89, in8, in9, inAB, inA, inB; + + __m128i red0, red1, red2, red_lo, red_hi; + __m128i green0, green1, green_lo, green_hi; + __m128i blue0, blue1, blue_lo, blue_hi; + + __m128i red_16_lo, red_16_hi; + __m128i green_16_lo, green_16_hi; + __m128i blue_16_lo, blue_16_hi; + + __m128i Y_lo, Y_hi; + __m128i Cb_lo, Cb_hi; + __m128i Cr_lo, Cr_hi; + + __m128i Ys_char, Cbs_char, Crs_char; + + for (i = 0; i < RGB_size; i += 3*16) { + in0123 = RGB_ptr[i/16]; + in01 = _mm_unpacklo_epi8(in0123, zero); + in0 = _mm_unpacklo_epi16(in01, zero); + in1 = _mm_unpackhi_epi16(in01, zero); + in23 = _mm_unpackhi_epi8(in0123, zero); + in2 = _mm_unpacklo_epi16(in23, zero); + in3 = _mm_unpackhi_epi16(in23, zero); + in4567 = RGB_ptr[i/16+1]; + in45 = _mm_unpacklo_epi8(in4567, zero); + in4 = _mm_unpacklo_epi16(in45, zero); + in5 = _mm_unpackhi_epi16(in45, zero); + in67 = _mm_unpackhi_epi8(in4567, zero); + in6 
= _mm_unpacklo_epi16(in67, zero); + in7 = _mm_unpackhi_epi16(in67, zero); + in89AB = RGB_ptr[i/16+2]; + in89 = _mm_unpacklo_epi8(in89AB, zero); + in8 = _mm_unpacklo_epi16(in89, zero); + in9 = _mm_unpackhi_epi16(in89, zero); + inAB = _mm_unpackhi_epi8(in89AB, zero); + inA = _mm_unpacklo_epi16(inAB, zero); + inB = _mm_unpackhi_epi16(inAB, zero); + + red0 = _mm_and_si128(_mm_shuffle_epi32(in0, idx(0,3,0,0)), msk(1,1,0,0)); + red1 = _mm_and_si128(in1, msk(0,0,1,0)); + red2 = _mm_and_si128(_mm_shuffle_epi32(in2, idx(0,0,0,1)), msk(0,0,0,1)); + red_lo = _mm_or_si128(_mm_or_si128(red0, red1), red2); + + red0 = _mm_and_si128(_mm_shuffle_epi32(in3, idx(0,3,0,0)), msk(1,1,0,0)); + red1 = _mm_and_si128(in4, msk(0,0,1,0)); + red2 = _mm_and_si128(_mm_shuffle_epi32(in5, idx(0,0,0,1)), msk(0,0,0,1)); + red_hi = _mm_or_si128(_mm_or_si128(red0, red1), red2); + red_16_lo = _mm_packs_epi32(red_lo, red_hi); + + red0 = _mm_and_si128(_mm_shuffle_epi32(in6, idx(0,3,0,0)), msk(1,1,0,0)); + red1 = _mm_and_si128(in7, msk(0,0,1,0)); + red0 = _mm_or_si128(red0, red1); + red1 = _mm_and_si128(_mm_shuffle_epi32(in8, idx(0,0,0,1)), msk(0,0,0,1)); + red_lo = _mm_or_si128(red0, red1); + + red0 = _mm_and_si128(_mm_shuffle_epi32(in9, idx(0,3,0,0)), msk(1,1,0,0)); + red1 = _mm_and_si128(inA, msk(0,0,1,0)); + red0 = _mm_or_si128(red0, red1); + red1 = _mm_and_si128(_mm_shuffle_epi32(inB, idx(0,0,0,1)), msk(0,0,0,1)); + red_hi = _mm_or_si128(red0, red1); + red_16_hi = _mm_packs_epi32(red_lo, red_hi); + + green0 = mask(extract(in0, 1,0,0,0), 1, 0, 0, 0); + green1 = mask(extract(in1, 0, 0, 3, 0), 0, 1, 1, 0); + green0 = _mm_or_si128(green0, green1); + green1 = mask(extract(in2, 0, 0, 0, 2), 0, 0, 0, 1); + green_lo = _mm_or_si128(green0, green1); + + green0 = mask(extract(in3, 1,0,0,0), 1, 0, 0, 0); + green1 = mask(extract(in4, 0, 0, 3, 0), 0, 1, 1, 0); + green0 = _mm_or_si128(green0, green1); + green1 = mask(extract(in5, 0, 0, 0, 2), 0, 0, 0, 1); + green_hi = _mm_or_si128(green0, green1); + green_16_lo = 
_mm_packs_epi32(green_lo, green_hi); + + green0 = mask(extract(in6, 1,0,0,0), 1, 0, 0, 0); + green1 = mask(extract(in7, 0, 0, 3, 0), 0, 1, 1, 0); + green0 = _mm_or_si128(green0, green1); + green1 = mask(extract(in8, 0, 0, 0, 2), 0, 0, 0, 1); + green_lo = _mm_or_si128(green0, green1); + + green0 = mask(extract(in9, 1,0,0,0), 1, 0, 0, 0); + green1 = mask(extract(inA, 0, 0, 3, 0), 0, 1, 1, 0); + green0 = _mm_or_si128(green0, green1); + green1 = mask(extract(inB, 0, 0, 0, 2), 0, 0, 0, 1); + green_hi = _mm_or_si128(green0, green1); + green_16_hi = _mm_packs_epi32(green_lo, green_hi); + + blue0 = mask(extract(in0, 2,0,0,0), 1, 0, 0, 0); + blue1 = mask(extract(in1, 0, 1, 0, 0), 0, 1, 0, 0); + blue0 = _mm_or_si128(blue0, blue1); + blue1 = mask(extract(in2, 0, 0, 0, 3), 0, 0, 1, 1); + blue_lo = _mm_or_si128(blue0, blue1); + + blue0 = mask(extract(in3, 2,0,0,0), 1, 0, 0, 0); + blue1 = mask(extract(in4, 0, 1, 0, 0), 0, 1, 0, 0); + blue0 = _mm_or_si128(blue0, blue1); + blue1 = mask(extract(in5, 0, 0, 0, 3), 0, 0, 1, 1); + blue_hi = _mm_or_si128(blue0, blue1); + blue_16_lo = _mm_packs_epi32(blue_lo, blue_hi); + + blue0 = mask(extract(in6, 2,0,0,0), 1, 0, 0, 0); + blue1 = mask(extract(in7, 0, 1, 0, 0), 0, 1, 0, 0); + blue0 = _mm_or_si128(blue0, blue1); + blue1 = mask(extract(in8, 0, 0, 0, 3), 0, 0, 1, 1); + blue_lo = _mm_or_si128(blue0, blue1); + + blue0 = mask(extract(in9, 2,0,0,0), 1, 0, 0, 0); + blue1 = mask(extract(inA, 0, 1, 0, 0), 0, 1, 0, 0); + blue0 = _mm_or_si128(blue0, blue1); + blue1 = mask(extract(inB, 0, 0, 0, 3), 0, 0, 1, 1); + blue_hi = _mm_or_si128(blue0, blue1); + blue_16_hi = _mm_packs_epi32(blue_lo, blue_hi); + + Y_lo = vec_mr(red_16_lo, 8432); + Y_hi = vec_mr(red_16_hi, 8432); + + Cb_lo = vec_mr(red_16_lo, -4818); + Cb_hi = vec_mr(red_16_hi, -4818); + + Cr_lo = vec_mr(red_16_lo, 14345); + Cr_hi = vec_mr(red_16_hi, 14345); + + Y_lo = vec_mradds(green_16_lo, 16425, Y_lo); + Y_hi = vec_mradds(green_16_hi, 16425, Y_hi); + + Cb_lo = vec_mradds(green_16_lo, -9527, 
Cb_lo); + Cb_hi = vec_mradds(green_16_hi, -9527, Cb_hi); + + Cr_lo = vec_mradds(green_16_lo, -12045, Cr_lo); + Cr_hi = vec_mradds(green_16_hi, -12045, Cr_hi); + + Y_lo = vec_mradds(blue_16_lo, 3176, Y_lo); + Y_hi = vec_mradds(blue_16_hi, 3176, Y_hi); + + Cb_lo = vec_mradds(blue_16_lo, 14345, Cb_lo); + Cb_hi = vec_mradds(blue_16_hi, 14345, Cb_hi); + + Cr_lo = vec_mradds(blue_16_lo, -2300, Cr_lo); + Cr_hi = vec_mradds(blue_16_hi, -2300, Cr_hi); + + Y_lo = _mm_adds_epi16(Y_lo, constant_16); + Y_hi = _mm_adds_epi16(Y_hi, constant_16); + + Cb_lo = _mm_adds_epi16(Cb_lo, constant_128); + Cb_hi = _mm_adds_epi16(Cb_hi, constant_128); + + Cr_lo = _mm_adds_epi16(Cr_lo, constant_128); + Cr_hi = _mm_adds_epi16(Cr_hi, constant_128); + + Ys_char = _mm_packus_epi16(Y_lo, Y_hi); + Cbs_char = _mm_packus_epi16(Cb_lo, Cb_hi); + Crs_char = _mm_packus_epi16(Cr_lo, Cr_hi); + + for (j = 0; j < 16; ++j) { + YCC_char_ptr[i+3*j] = ((unsigned char*) &Ys_char)[j]; + YCC_char_ptr[i+1+3*j] = ((unsigned char*) &Cbs_char)[j]; + YCC_char_ptr[i+2+3*j] = ((unsigned char*) &Crs_char)[j]; + } + } + + malloc(0); + + } Index: llvm/examples/SIMD/RGB2YUV/rgb2yuv.vectorc.c diff -c /dev/null llvm/examples/SIMD/RGB2YUV/rgb2yuv.vectorc.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:18 2005 --- llvm/examples/SIMD/RGB2YUV/rgb2yuv.vectorc.c Sun Oct 23 17:49:41 2005 *************** *** 0 **** --- 1 ---- + #include "VectorC.h" #include "Intrinsics.h" // Selects whether to use vllvm_mradds with third argument c0 // (constant 0), or vllvm_mr instead. Using c0 may be slightly faster // on AltiVec. Not using c0 saves an "adds x, 0" op and may be // slightly faster on architectures (e.g., SSE2) that don't support // mradds as a single op. Smarter code generation can make USE_C0 0 // just as fast on AltiVec as USE_C0 1. 
// #define USE_C0 1 // Note that both vllvm_mr and vllvm_mradds are defined in // "Intrinsics.h" and expand to patterns of more primitive Vector LLVM // instructions // void rgb2yuv_vector(unsigned char *RGB_ptr, int RGB_size, unsigned char *YCC_ptr) { signed short r0, r1, r2, g0, g1, g2, b0, b1, b2, c16, c128; unsigned char z0, tc0, tc1, tc2, tc3; signed short tr0, tr1, tg0, tg1, tb0, tb1; signed short t0, t1, t2, t3, t4, t5; int i; unsigned char vPerm1 = vllvm_constant_unsigned_char( 0, 3, 6, 9, 12, 15, 18, 21, 1, 4, 7, 10, 13, 16, 19, 22); unsigned char vPerm2 = vllvm_constant_unsigned_char( 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0); unsigned char vPerm3 = vllvm_constant_unsigned_char( 8, 11, 14, 17, 20, 23, 26, 29, 9, 12, 15, 18, 21, 24, 27, 30); unsigned char vPerm4 = vllvm_constant_unsigned_char(10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0, 0, 0, 0); r0 = vllvm_fixed_vimm_short(8432, 8); g0 = vllvm_fixed_vimm_short(16425, 8); b0 = vllvm_fixed_vimm_short(3176, 8); r1 = vllvm_fixed_vimm_short(-4818, 8); g1 = vllvm_fixed_vimm_short(-9527, 8); b1 = vllvm_fixed_vimm_short(14345, 8); r2 = vllvm_fixed_vimm_short(14345, 8); g2 = vllvm_fixed_vimm_short(-12045, 8); b2 = vllvm_fixed_vimm_short(-2300, 8); c16 = vllvm_fixed_vimm_short(16, 8); c128 = vllvm_fixed_vimm_short(128, 8); #if USE_C0 signed short c0 = vllvm_fixed_vimm_short(0, 8); #endif for ( i = 0; i < (RGB_size/16); i+=3 ) { unsigned char v0 = vllvm_load_unsigned_char(RGB_ptr, 16, i); unsigned char v1 = vllvm_load_unsigned_char(RGB_ptr, 16, i+1); unsigned char v2 = vllvm_load_unsigned_char(RGB_ptr, 16, i+2); char tmp = vllvm_fixed_vimm_char(0, 32); tmp = vllvm_fixed_combine_unsigned_char(tmp, 32, v0, 16, 0, 1); tmp = vllvm_fixed_combine_unsigned_char(tmp, 32, v1, 16, 16, 1); tc0 = vllvm_fixed_permute_unsigned_char(tmp, 32, vPerm1, 16); char tmp1 = vllvm_fixed_vimm_char(0, 32); tmp1 = vllvm_fixed_combine_unsigned_char(tmp1, 32, v0, 16, 0, 1); tmp1 = vllvm_fixed_combine_unsigned_char(tmp1, 32, v1, 
16, 16, 1); tc1 = vllvm_fixed_permute_unsigned_char(tmp1, 32, vPerm2, 16); char tmp2 = vllvm_fixed_vimm_char(0, 32); tmp2 = vllvm_fixed_combine_unsigned_char(tmp2, 32, v1, 16, 0, 1); tmp2 = vllvm_fixed_combine_unsigned_char(tmp2, 32, v2, 16, 16, 1); tc2 = vllvm_fixed_permute_unsigned_char(tmp2, 32, vPerm3, 16); char tmp3 = vllvm_fixed_vimm_char(0, 32); tmp3 = vllvm_fixed_combine_unsigned_char(tmp3, 32, v1, 16, 0, 1); tmp3 = vllvm_fixed_combine_unsigned_char(tmp3, 32, v2, 16, 16, 1); tc3 = vllvm_fixed_permute_unsigned_char(tmp3, 32, vPerm4, 16); tr0 = _extract_unsigned_char(tc0, 0, 1, 8); tg0 = _extract_unsigned_char(tc0, 8, 1, 8); tb0 = _extract_unsigned_char(tc1, 0, 1, 8); tr1 = _extract_unsigned_char(tc2, 0, 1, 8); tg1 = _extract_unsigned_char(tc2, 8, 1, 8); tb1 = _extract_unsigned_char(tc3, 0, 1, 8); #if USE_C0 t0 = vllvm_mradds_short( tr0, r0, c0 ); t1 = vllvm_mradds_short( tr0, r1, c0 ); t2 = vllvm_mradds_short( tr0, r2, c0 ); #else t0 = vllvm_mr_short( tr0, r0 ); t1 = vllvm_mr_short( tr0, r1 ); t2 = vllvm_mr_short( tr0, r2 ); #endif t0 = vllvm_mradds_short( tg0, g0, t0 ); t1 = vllvm_mradds_short( tg0, g1, t1 ); t2 = vllvm_mradds_short( tg0, g2, t2 ); t0 = vllvm_mradds_short( tb0, b0, t0 ); t1 = vllvm_mradds_short( tb0, b1, t1 ); t2 = vllvm_mradds_short( tb0, b2, t2 ); #if USE_C0 t3 = vllvm_mradds_short( tr1, r0, c0 ); t4 = vllvm_mradds_short( tr1, r1, c0 ); t5 = vllvm_mradds_short( tr1, r2, c0 ); #else t3 = vllvm_mr_short( tr1, r0 ); t4 = vllvm_mr_short( tr1, r1 ); t5 = vllvm_mr_short( tr1, r2 ); #endif t3 = vllvm_mradds_short( tg1, g0, t3 ); t4 = vllvm_mradds_short( tg1, g1, t4 ); t5 = vllvm_mradds_short( tg1, g2, t5 ); t3 = vllvm_mradds_short( tb1, b0, t3 ); t4 = vllvm_mradds_short( tb1, b1, t4 ); t5 = vllvm_mradds_short( tb1, b2, t5 ); t0 = vllvm_adds_short( t0, c16 ); t3 = vllvm_adds_short( t3, c16 ); t1 = vllvm_adds_short( t1, c128 ); t4 = vllvm_adds_short( t4, c128 ); t2 = vllvm_adds_short( t2, c128 ); t5 = vllvm_adds_short( t5, c128 ); short out0 = 
vllvm_fixed_vimm_short(0, 16); short out1 = vllvm_fixed_combine_short(out0, 16, t0, 8, 0, 1); short out2 = vllvm_fixed_combine_short(out1, 16, t3, 8, 8, 1); unsigned char out3 = vllvm_saturate_short_uchar(out2); vllvm_store_unsigned_char(out3, YCC_ptr, i); out1 = vllvm_fixed_combine_short(out0, 16, t1, 8, 0, 1); out2 = vllvm_fixed_combine_short(out1, 16, t4, 8, 8, 1); out3 = vllvm_saturate_short_uchar(out2); vllvm_store_unsigned_char(out3, YCC_ptr, i+1); out1 = vllvm_fixed_combine_short(out0, 16, t2, 8, 0, 1); out2 = vllvm_fixed_combine_short(out1, 16, t5, 8, 8, 1); out3 = vllvm_saturate_short_uchar(out2); vllvm_store_unsigned_char(out3, YCC_ptr, i+2); } } \ No newline at end of file _______________________________________________ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits