Changes in directory llvm/examples/SIMD/InterQuant:
Makefile added (r1.1.2.1) interquant.altivec.handwritten.c added (r1.1.2.1) interquant.sse.handwritten.c added (r1.1.2.1) interquant.vectorc.c added (r1.1.2.1) main.c added (r1.1.2.1) --- Log message: Examples to illustrate Vector LLVM's SIMD support. --- Diffs of the changes: (+201 -0) Makefile | 4 + interquant.altivec.handwritten.c | 1 interquant.sse.handwritten.c | 40 +++++++++++++ interquant.vectorc.c | 44 +++++++++++++++ main.c | 112 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 201 insertions Index: llvm/examples/SIMD/InterQuant/Makefile diff -c /dev/null llvm/examples/SIMD/InterQuant/Makefile:1.1.2.1 *** /dev/null Sun Oct 23 17:50:00 2005 --- llvm/examples/SIMD/InterQuant/Makefile Sun Oct 23 17:49:40 2005 *************** *** 0 **** --- 1,4 ---- + NAME= interquant + + include ../Makefile.common + Index: llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:17 2005 --- llvm/examples/SIMD/InterQuant/interquant.altivec.handwritten.c Sun Oct 23 17:49:40 2005 *************** *** 0 **** --- 1 ---- + /*************************************************************** * * Copyright: (c) Copyright Motorola Inc. 1998 * * Date: May 18, 1998 * * Function: INTER_Quantization * * Description: The INTER_QUANTIZATION routine will quantize * the predictive frames (P-picture). Coefficients * are quantized to the formula: * C' = sign(C) * ( abs(C) - QP/2 ) / ( 2 * QP ). * To ensure ( abs(C) - QP/2 ) is positive, saturating * unsigned subtraction is used. * * Inputs: input - Pointer to input data (short), which * must be between -2040 and 2040 (as set * up by DCT ). It is assumed that the allocated * array has been 128-bit aligned and contains * 8x8 short elements. * * Outputs: output - Pointer to output area for the transfored * data. The output values are between -127 * and 127. 
It is assumed that a 128-bit * aligned 8x8 array of signed char has been * pre-allocated. * * QP: QP (quantization parameter?) ranges from 1 to 31 * **************************************************************/ #define INTER_CALC( input, output ) \ t1 = vec_subs( zero, input);\ u1 = (vector unsigned short ) vec_max( input, t1 ); /* ( abs(C)) */ \ t2 = (vector signed short ) vec_subs( u1, qpd2 );/*max(0,(abs(C)-QP/2)) */ \ t3 = vec_madds( t2, dtqp.v, zero ); /* ( (abs(C)-QP/2)/(2*QP) )>>15 ) */ \ t4 = vec_min(maxq,t3); /* peg value at 127 if greater */ \ msk = vec_cmpgt( zero, input ); /* select to find sign of input */ \ t5 = vec_subs( zero, t4 );\ output = vec_sel( t4, t5, msk ); /* ensure result is same sign */ void interquant_vector ( signed short* in, signed char* out, int QP ) { vector signed short* input = (vector signed short*) in; vector signed char* output = (vector signed char*) out; /* ensure alignment so calculated constant can be propagated into entire vector for calculations */ union{ vector signed short v; signed short s[8]; } dtqp; vector signed short zero, minus1, maxq, parta, partb; vector signed short t1, t2, t3, t4, t5; /* used in macros */ vector unsigned short qpd2, u1; vector bool short msk; /* load the calculated constant into the vector */ dtqp.s[0] = (signed short)((int)((32768+QP)/(2*QP))); dtqp.s[1] = (signed short)(QP/2); qpd2 = (vector unsigned short) vec_splat( dtqp.v, 1); dtqp.v = vec_splat( dtqp.v, 0 ); /* load the static constants used in the macros */ zero = (vector signed short) (0); maxq = (vector signed short) (127); minus1 = (vector signed short) (-1); /* for all input compute: C' = sign(C) * ( (abs(C)-(QP /2) ) / 2*QP ) */ INTER_CALC( input[0], parta ); INTER_CALC( input[1], partb ); output[0] = vec_pack( parta, partb ); INTER_CALC( input[2], parta ); INTER_CALC( input[3], partb ); output[1] = vec_pack( parta, partb ); INTER_CALC( input[4], parta ); INTER_CALC( input[5], partb ); output[2] = vec_pack( parta, partb ); 
INTER_CALC( input[6], parta ); INTER_CALC( input[7], partb ); output[3] = vec_pack( parta, partb ); } \ No newline at end of file Index: llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:18 2005 --- llvm/examples/SIMD/InterQuant/interquant.sse.handwritten.c Sun Oct 23 17:49:40 2005 *************** *** 0 **** --- 1,40 ---- + #include "SSE.h" + + void interquant_vector ( signed short* in, + signed char* out, + int qp) { + int i, j, k; + short dtqp = (32768+qp)/(2*qp); + __m128i dtqp_vec = _mm_splat_epi16(dtqp); + __m128i zero = _mm_splat_epi16(0); + __m128i qpd2 = _mm_splat_epi16(qp/2); + __m128i maxq = _mm_splat_epi16(127); + __m128i *in_vp = (__m128i*) in; + __m128i *out_vp = (__m128i*) out; + __m128i result[2]; + + for (i = 0; i < 4; ++i) { + for (j = 0; j < 2; ++j) { + __m128i input = *in_vp++; + __m128i t1 = _mm_subs_epi16(zero, input); + __m128i u1 = _mm_max_epi16(input, t1); + __m128i t2 = _mm_subs_epu16(u1, qpd2); + + // unsigned tmp = (unsigned) t2 * (unsigned) dtqp_vec + __m128i tmp_hi = _mm_mulhi_epi16(t2, dtqp_vec); + __m128i tmp_lo = _mm_mullo_epi16(t2, dtqp_vec); + + // short t3 = tmp >> 15 + __m128i hi = _mm_slli_epi16(tmp_hi, 1); + __m128i lo = _mm_srli_epi16(tmp_lo, 15); + __m128i t3 = _mm_or_si128(hi, lo); + + __m128i t4 = _mm_min_epi16(maxq, t3); + __m128i mask = _mm_cmpgt_epi16(zero, input); + __m128i neg = _mm_subs_epi16(zero, t4); + result[j] = _mm_select_si128(mask, neg, t4); + } + *out_vp++ = _mm_pack_epi16(result[0], result[1]); + } + } + Index: llvm/examples/SIMD/InterQuant/interquant.vectorc.c diff -c /dev/null llvm/examples/SIMD/InterQuant/interquant.vectorc.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:18 2005 --- llvm/examples/SIMD/InterQuant/interquant.vectorc.c Sun Oct 23 17:49:40 2005 *************** *** 0 **** --- 1,44 ---- + #include "VectorC.h" + #include "Intrinsics.h" + + void interquant_vector(signed short* in, 
signed char* out, int qp ) { + int i, j; + + short part1, part2; + short t1, t2, t3, t4, t5; + unsigned short u1; + short msk; + + unsigned short qpd2 = vllvm_fixed_vimm_short((short) qp/2, 8); + short v = vllvm_fixed_vimm_short((short)((int)((32768+qp)/(2*qp))), 8); + + short zero = vllvm_fixed_vimm_short(0, 8); + short maxq = vllvm_fixed_vimm_short(127, 8); + + for (i = 0; i < 4; ++i) { + short in_vec = vllvm_load_short(in, 8, 2*i); + t1 = vllvm_subs_short( zero, in_vec); + u1 = (unsigned short) vllvm_max_short( in_vec, t1 ); + t2 = vllvm_subs_ushort( u1, qpd2 ); + t3 = t2*v >> 15; + t4 = vllvm_min_short(maxq,t3); + msk = zero > in_vec; + t5 = vllvm_subs_short( zero, t4 ); + part1 = vllvm_vselect_short(msk, t5, t4); + + in_vec = vllvm_load_short(in, 8, 2*i+1); + t1 = vllvm_subs_short( zero, in_vec); + u1 = (unsigned short) vllvm_max_short( in_vec, t1 ); + t2 = (short) vllvm_subs_ushort( u1, qpd2 ); + t3 = (t2*v) >> 15; + t4 = vllvm_min_short(maxq,t3); + msk = zero > in_vec; + t5 = vllvm_subs_short( zero, t4 ); + part2 = vllvm_vselect_short(msk, t5, t4); + + short out_vec = vllvm_fixed_vimm_short(0, 16); + out_vec = vllvm_fixed_combine_short(out_vec, 16, part1, 8, 0, 1); + out_vec = vllvm_fixed_combine_short(out_vec, 16, part2, 8, 8, 1); + vllvm_store_char(out_vec, out, i); + } + } Index: llvm/examples/SIMD/InterQuant/main.c diff -c /dev/null llvm/examples/SIMD/InterQuant/main.c:1.1.2.1 *** /dev/null Sun Oct 23 17:50:18 2005 --- llvm/examples/SIMD/InterQuant/main.c Sun Oct 23 17:49:40 2005 *************** *** 0 **** --- 1,112 ---- + #define N 1024 //2048*2 + #define MAX_QP 31 + + #include <stdio.h> + #include <stdlib.h> + #include <sys/time.h> + #include <sys/times.h> + #include "../_malloc.h" + + void interquant_scalar(short*,signed char*,int); + void interquant_vector(short*,signed char*,int); + + short *in; + char *vector; + char *scalar; + + void init() { + int i; + + // Force 16-byte alignment + // + in = (short*) _malloc(N*sizeof(short)); + vector = (char*) 
_malloc(N*sizeof(short)); + scalar = (char*) _malloc(N*sizeof(short)); + + // Populate in with a range of values + // + for (i = 0; i < N; ++i) { + in[i] = -(N/2)+i; + } + + } + + void run(long *scalar_time, long *vector_time) { + long t0, t1, t2; + int i,j; + int qp = 10; + struct tms buf_s, buf_e; + + init(); + + times(&buf_s); + for (j = 0; j < 100000; ++j) + for (i = 0; i < N/64; ++i) + interquant_scalar(in+64*i, scalar+64*i, qp); + times(&buf_e); + + *scalar_time = buf_e.tms_utime - buf_s.tms_utime; + printf("scalar time=%d, ", *scalar_time); + + times(&buf_s); + for (j = 0; j < 100000; ++j) + for (i = 0; i < N/64; ++i) + interquant_vector(in+64*i, vector+64*i, qp); + times(&buf_e); + + *vector_time = buf_e.tms_utime - buf_s.tms_utime; + printf("vector time=%d, ", *vector_time); + + for (i = 0; i < N; i++) { + if (vector[i] != scalar[i]) { + printf("FAILED\n"); + exit(1); + } + } + + float speedup = ((float) *scalar_time) / *vector_time; + printf("speedup=%f\n", speedup); + + } + + int + main (void) { + unsigned i; + init(); + + long best_scalar = -1, best_vector = -1; + long scalar, vector; + for (i = 0; i < NRUNS; ++i) { + run (&scalar, &vector); + if (best_scalar < 0 || best_scalar > scalar) + best_scalar = scalar; + if (best_vector < 0 || best_vector > vector) + best_vector = vector; + } + + printf("best scalar=%d, ", best_scalar); + printf("best vector=%d, ", best_vector); + printf("speedup=%f\n", ((float) best_scalar)/best_vector); + printf ("PASSED\n"); + return 0; + } + + void interquant_scalar( signed short* in, signed char* out, int qp) { + int i; + int qpd2 = (32768+qp)/(2*qp); + + for (i = 0; i < 64; ++i) { + short input = in[i]; + short t1 = (input == -32768) ? 32767 : -input; + unsigned short u1 = (unsigned short) ((input > t1) ? input : t1); + short t2 = (short) (u1 - (qp/2)); + t2 = (t2 > 0) ? t2 : 0; + //int t3 = (t2 * ((32768+qp)/(2*qp))) / 32768; + int t3 = (t2 * qpd2) /32768; + t3 = (t3 > 32767) ? 32767 : t3; + t3 = (t3 < -32768) ? 
-32768 : t3; + short t4 = (t3 < 127) ? t3 : 127; + out[i] = (input < 0) ? -t4 : t4; + } + } + _______________________________________________ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits