Changes in directory llvm/examples/SIMD/Transpose:
Makefile added (r1.1.2.1) main.c added (r1.1.2.1) transpose.altivec.handwritten.c added (r1.1.2.1) transpose.sse.handwritten.c added (r1.1.2.1) transpose.vectorc.c added (r1.1.2.1) --- Log message: Examples to illustrate Vector LLVM's SIMD support. --- Diffs of the changes: (+226 -0) Makefile | 4 + main.c | 98 ++++++++++++++++++++++++++++++++++++++++ transpose.altivec.handwritten.c | 44 +++++++++++++++++ transpose.sse.handwritten.c | 38 +++++++++++++++ transpose.vectorc.c | 42 +++++++++++++++++ 5 files changed, 226 insertions Index: llvm/examples/SIMD/Transpose/Makefile diff -c /dev/null llvm/examples/SIMD/Transpose/Makefile:1.1.2.1 *** /dev/null Sun Oct 23 17:50:00 2005 --- llvm/examples/SIMD/Transpose/Makefile Sun Oct 23 17:49:42 2005 *************** *** 0 **** --- 1,4 ---- + NAME= transpose + + include ../Makefile.common + Index: llvm/examples/SIMD/Transpose/main.c diff -c /dev/null llvm/examples/SIMD/Transpose/main.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:24 2005 --- llvm/examples/SIMD/Transpose/main.c Sun Oct 23 17:49:42 2005 *************** *** 0 **** --- 1,98 ---- + #define N 1024 + + #include <stdio.h> + #include <stdlib.h> + #include <sys/time.h> + #include <sys/times.h> + #include <assert.h> + #include "../_malloc.h" + + inline void transpose_scalar(short*, short*); + void transpose_vector(short*, short*); + + short *in; + short *out_vector; + short *out_scalar; + + void init() { + int i; + + // Force 16-byte alignment + // + in = (short*) _malloc(N*sizeof(short)); + out_vector = (short*) _malloc(N*sizeof(short)); + out_scalar = (short*) _malloc(N*sizeof(short)); + + // Populate in with a range of values + // + for (i = 0; i < N; ++i) { + in[i] = N/2-i; + } + + } + + float run() { + long t0, t1, t2; + int i,j; + struct tms buf_s, buf_e; + long scalar_time = 0, vector_time = 0; + + times(&buf_s); + for (i = 0; i < 1000000; ++i) + for (j = 0; j < N/64; ++j) + transpose_scalar(in+64*j, out_scalar+64*j); + times(&buf_e); + scalar_time = buf_e.tms_utime - buf_s.tms_utime; + printf("scalar time=%d, ", scalar_time); + + times(&buf_s); + for (i = 0; i < 1000000; ++i) + for (j = 0; j < N/64; ++j) + transpose_vector(in+64*j, out_vector+64*j); + times(&buf_e); + vector_time = buf_e.tms_utime - buf_s.tms_utime; + printf("vector time=%d, ", vector_time); + + float speedup = ((float) scalar_time)/vector_time; + printf("speedup=%f\n", speedup); + + for (i = 0; i < N; i++) { + if (out_vector[i] != out_scalar[i]) { + printf("FAILED\n"); + exit(1); + } + } + + return speedup; + } + + int + main (void) + { + unsigned i; + init(); + float best = 0; + for (i = 0; i < NRUNS; ++i) { + float speedup = run(); + if (speedup > best) + best = speedup; + } + printf("best speedup=%f\n", best); + + printf ("PASSED\n"); + return 0; + } + + void transpose_scalar ( short *input_scalar, short *output_scalar) { + unsigned i; + for (i = 0; i < 8; ++i) { + output_scalar[i] = input_scalar[8*i]; + output_scalar[8+i] = input_scalar[8*i+1]; + output_scalar[16+i] = input_scalar[8*i+2]; + output_scalar[24+i] = input_scalar[8*i+3]; + output_scalar[32+i] = input_scalar[8*i+4]; + output_scalar[40+i] = input_scalar[8*i+5]; + output_scalar[48+i] = input_scalar[8*i+6]; + output_scalar[56+i] = input_scalar[8*i+7]; + } + } Index: llvm/examples/SIMD/Transpose/transpose.altivec.handwritten.c diff -c /dev/null llvm/examples/SIMD/Transpose/transpose.altivec.handwritten.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:24 2005 --- llvm/examples/SIMD/Transpose/transpose.altivec.handwritten.c Sun Oct 23 17:49:42 2005 *************** *** 0 **** --- 1,44 ---- + void print_vector(vector short v) { + unsigned i; + short *p = ((short*) &v); + for (i = 0; i < 8; ++i) + printf("%04X ", p[i]); + printf("\n"); + } + + inline void transpose_vector ( short *input_scalar, short *output_scalar) + { + vector signed short *input = (vector signed short*) input_scalar; + vector signed short *output = (vector signed short*) output_scalar; + + vector signed short a0, a1, a2, a3, a4, a5, a6, a7; + vector signed short b0, b1, b2, b3, b4, b5, b6, b7; + + b0 = vec_mergeh( input[0], input[4] ); /* [ 00 40 01 41 02 42 03 43 ]*/ + b1 = vec_mergel( input[0], input[4] ); /* [ 04 44 05 45 06 46 07 47 ]*/ + b2 = vec_mergeh( input[1], input[5] ); /* [ 10 50 11 51 12 52 13 53 ]*/ + b3 = vec_mergel( input[1], input[5] ); /* [ 14 54 15 55 16 56 17 57 ]*/ + b4 = vec_mergeh( input[2], input[6] ); /* [ 20 60 21 61 22 62 23 63 ]*/ + b5 = vec_mergel( input[2], input[6] ); /* [ 24 64 25 65 26 66 27 67 ]*/ + b6 = vec_mergeh( input[3], input[7] ); /* [ 30 70 31 71 32 72 33 73 ]*/ + b7 = vec_mergel( input[3], input[7] ); /* [ 34 74 35 75 36 76 37 77 ]*/ + + a0 = vec_mergeh( b0, b4 ); /* [ 00 20 40 60 01 21 41 61 ]*/ + a1 = vec_mergel( b0, b4 ); /* [ 02 22 42 62 03 23 43 63 ]*/ + a2 = vec_mergeh( b1, b5 ); /* [ 04 24 44 64 05 25 45 65 ]*/ + a3 = vec_mergel( b1, b5 ); /* [ 06 26 46 66 07 27 47 67 ]*/ + a4 = vec_mergeh( b2, b6 ); /* [ 10 30 50 70 11 31 51 71 ]*/ + a5 = vec_mergel( b2, b6 ); /* [ 12 32 52 72 13 33 53 73 ]*/ + a6 = vec_mergeh( b3, b7 ); /* [ 14 34 54 74 15 35 55 75 ]*/ + a7 = vec_mergel( b3, b7 ); /* [ 16 36 56 76 17 37 57 77 ]*/ + + output[0] = vec_mergeh( a0, a4 ); /* [ 00 10 20 30 40 50 60 70 ]*/ + output[1] = vec_mergel( a0, a4 ); /* [ 01 11 21 31 41 51 61 71 ]*/ + output[2] = vec_mergeh( a1, a5 ); /* [ 02 12 22 32 42 52 62 72 ]*/ + output[3] = vec_mergel( a1, a5 ); /* [ 03 13 23 33 43 53 63 73 ]*/ + output[4] = vec_mergeh( a2, a6 ); /* [ 04 14 24 34 44 54 64 74 ]*/ + output[5] = vec_mergel( a2, a6 ); /* [ 05 15 25 35 45 55 65 75 ]*/ + output[6] = vec_mergeh( a3, a7 ); /* [ 06 16 26 36 46 56 66 76 ]*/ + output[7] = vec_mergel( a3, a7 ); /* [ 07 17 27 37 47 57 67 77 ]*/ + + } Index: llvm/examples/SIMD/Transpose/transpose.sse.handwritten.c diff -c /dev/null llvm/examples/SIMD/Transpose/transpose.sse.handwritten.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:24 2005 --- llvm/examples/SIMD/Transpose/transpose.sse.handwritten.c Sun Oct 23 17:49:42 2005 *************** *** 0 **** --- 1,38 ---- + #include "SSE.h" + + inline void transpose_vector ( short *input_scalar, short *output_scalar) + { + __m128i *input = (__m128i*) input_scalar; + __m128i *output = (__m128i*) output_scalar; + + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i b0, b1, b2, b3, b4, b5, b6, b7; + + b0 = _mm_unpacklo_epi16( input[0], input[4] ); /* [ 00 40 01 41 02 42 03 43 ]*/ + b1 = _mm_unpackhi_epi16( input[0], input[4] ); /* [ 04 44 05 45 06 46 07 47 ]*/ + b2 = _mm_unpacklo_epi16( input[1], input[5] ); /* [ 10 50 11 51 12 52 13 53 ]*/ + b3 = _mm_unpackhi_epi16( input[1], input[5] ); /* [ 14 54 15 55 16 56 17 57 ]*/ + b4 = _mm_unpacklo_epi16( input[2], input[6] ); /* [ 20 60 21 61 22 62 23 63 ]*/ + b5 = _mm_unpackhi_epi16( input[2], input[6] ); /* [ 24 64 25 65 26 66 27 67 ]*/ + b6 = _mm_unpacklo_epi16( input[3], input[7] ); /* [ 30 70 31 71 32 72 33 73 ]*/ + b7 = _mm_unpackhi_epi16( input[3], input[7] ); /* [ 34 74 35 75 36 76 37 77 ]*/ + + a0 = _mm_unpacklo_epi16( b0, b4 ); /* [ 00 20 40 60 01 21 41 61 ]*/ + a1 = _mm_unpackhi_epi16( b0, b4 ); /* [ 02 22 42 62 03 23 43 63 ]*/ + a2 = _mm_unpacklo_epi16( b1, b5 ); /* [ 04 24 44 64 05 25 45 65 ]*/ + a3 = _mm_unpackhi_epi16( b1, b5 ); /* [ 06 26 46 66 07 27 47 67 ]*/ + a4 = _mm_unpacklo_epi16( b2, b6 ); /* [ 10 30 50 70 11 31 51 71 ]*/ + a5 = _mm_unpackhi_epi16( b2, b6 ); /* [ 12 32 52 72 13 33 53 73 ]*/ + a6 = _mm_unpacklo_epi16( b3, b7 ); /* [ 14 34 54 74 15 35 55 75 ]*/ + a7 = _mm_unpackhi_epi16( b3, b7 ); /* [ 16 36 56 76 17 37 57 77 ]*/ + + output[0] = _mm_unpacklo_epi16( a0, a4 ); /* [ 00 10 20 30 40 50 60 70 ]*/ + output[1] = _mm_unpackhi_epi16( a0, a4 ); /* [ 01 11 21 31 41 51 61 71 ]*/ + output[2] = _mm_unpacklo_epi16( a1, a5 ); /* [ 02 12 22 32 42 52 62 72 ]*/ + output[3] = _mm_unpackhi_epi16( a1, a5 ); /* [ 03 13 23 33 43 53 63 73 ]*/ + output[4] = _mm_unpacklo_epi16( a2, a6 ); /* [ 04 14 24 34 44 54 64 74 ]*/ + output[5] = _mm_unpackhi_epi16( a2, a6 ); /* [ 05 15 25 35 45 55 65 75 ]*/ + output[6] = _mm_unpacklo_epi16( a3, a7 ); /* [ 06 16 26 36 46 56 66 76 ]*/ + output[7] = _mm_unpackhi_epi16( a3, a7 ); /* [ 07 17 27 37 47 57 67 77 ]*/ + + } Index: llvm/examples/SIMD/Transpose/transpose.vectorc.c diff -c /dev/null llvm/examples/SIMD/Transpose/transpose.vectorc.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:24 2005 --- llvm/examples/SIMD/Transpose/transpose.vectorc.c Sun Oct 23 17:49:42 2005 *************** *** 0 **** --- 1,42 ---- + #include "VectorC.h" + + #define MERGE(out01, out0, out1, in0, in1) \ + short out01 = vllvm_fixed_vimm_short(0, 16); \ + out01 = vllvm_fixed_combine_short(out01, 16, in0, 8, 0, 1); \ + out01 = vllvm_fixed_combine_short(out01, 16, in1, 8, 8, 1); \ + short out0 = vllvm_extract_short(out01, 0, 2, 8); \ + short out1 = vllvm_extract_short(out01, 1, 2, 8) + + #define IN(x) \ + vllvm_load_short(input_scalar, 8, x) + + #define STORE(out, idx) \ + vllvm_store_short(out, output_scalar, idx) + + inline void transpose_vector (short *input_scalar, short *output_scalar) { + MERGE(b01, b0, b1, IN(0), IN(4)); + MERGE(b23, b2, b3, IN(1), IN(5)); + MERGE(b45, b4, b5, IN(2), IN(6)); + MERGE(b67, b6, b7, IN(3), IN(7)); + + MERGE(a01, a0, a1, b0, b4); + MERGE(a23, a2, a3, b1, b5); + MERGE(a45, a4, a5, b2, b6); + MERGE(a67, a6, a7, b3, b7); + + MERGE(out01, out0, out1, a0, a4); + MERGE(out23, out2, out3, a1, a5); + MERGE(out45, out4, out5, a2, a6); + MERGE(out67, out6, out7, a3, a7); + + STORE(out0, 0); + STORE(out1, 1); + STORE(out2, 2); + STORE(out3, 3); + STORE(out4, 4); + STORE(out5, 5); + STORE(out6, 6); + STORE(out7, 7); + + } + _______________________________________________ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits