Changes in directory llvm-test/SingleSource/UnitTests/Vector/Altivec:
alti.expandfft.c added (r1.1) alti.isamax.c added (r1.1) alti.sdot.c added (r1.1) alti.stepfft.c added (r1.1) --- Log message: Added some Altivec and SSE examples from: Introduction to Parallel Computing A practical guide with examples in C Oxford Texts in Applied and Engineering Mathematics No. 9 Oxford University Press, February 2004 ISBN: 0-19-851576-6 (hardback), 0-19-851577-4 (paperback) http://people.inf.ethz.ch/arbenz/book/ --- Diffs of the changes: (+725 -0) alti.expandfft.c | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ alti.isamax.c | 131 +++++++++++++++++++++++++ alti.sdot.c | 103 +++++++++++++++++++ alti.stepfft.c | 204 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 725 insertions(+) Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c:1.1 *** /dev/null Mon Apr 3 19:48:04 2006 --- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.expandfft.c Mon Apr 3 19:47:54 2006 *************** *** 0 **** --- 1,287 ---- + #include <stdio.h> + #include <math.h> + #include <time.h> + #include <float.h> + #define N 1048576 + #define N2 N/2 + main() + { + /* + Example of Apple Altivec coded binary radix FFT + using intrinsics from Petersen and Arbenz "Intro. + to Parallel Computing," Section 3.6 + + This is an expanded version of a generic work-space + FFT: steps are in-line. cfft2(n,x,y,w,sign) takes complex + n-array "x" (Fortran real,aimag,real,aimag,... order) + and writes its DFT in "y". Both input "x" and the + original contents of "y" are destroyed. Initialization + for array "w" (size n/2 complex of twiddle factors + (exp(twopi*i*k/n), for k=0..n/2-1)) is computed once + by cffti(n,w). + + WPP, SAM. Math. ETHZ, 1 June, 2002 + */ + + int first,i,icase,it,ln2,n; + int nits=1000000; + static float seed = 331.0; + float error,fnm1,sign,z0,z1,ggl(); + float *x,*y,*z,*w; + double t1,mflops; + void cffti(),cfft2(); + /* allocate storage for x,y,z,w on 4-word bndr. */ + x = (float *) malloc(8*N); + y = (float *) malloc(8*N); + z = (float *) malloc(8*N); + w = (float *) malloc(4*N); + n = 2; + for(ln2=1;ln2<21;ln2++){ + first = 1; + for(icase=0;icase<2;icase++){ + if(first){ + for(i=0;i<2*n;i+=2){ + z0 = ggl(&seed); /* real part of array */ + z1 = ggl(&seed); /* imaginary part of array */ + x[i] = z0; + z[i] = z0; /* copy of initial real data */ + x[i+1] = z1; + z[i+1] = z1; /* copy of initial imag. data */ + } + } else { + for(i=0;i<2*n;i+=2){ + z0 = 0; /* real part of array */ + z1 = 0; /* imaginary part of array */ + x[i] = z0; + z[i] = z0; /* copy of initial real data */ + x[i+1] = z1; + z[i+1] = z1; /* copy of initial imag. data */ + } + } + /* initialize sine/cosine tables */ + cffti(n,w); + /* transform forward, back */ + if(first){ + sign = 1.0; + cfft2(n,x,y,w,sign); + sign = -1.0; + cfft2(n,y,x,w,sign); + /* results should be same as initial multiplied by n */ + fnm1 = 1.0/((float) n); + error = 0.0; + for(i=0;i<2*n;i+=2){ + error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) + + (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]); + } + error = sqrt(fnm1*error); + printf(" for n=%d, fwd/bck error=%e\n",n,error); + first = 0; + } else { + t1 = ((double)clock())/((double) CLOCKS_PER_SEC); + for(it=0;it<nits;it++){ + sign = +1.0; + cfft2(n,x,y,w,sign); + sign = -1.0; + cfft2(n,y,x,w,sign); + } + t1 = ((double)clock())/((double) CLOCKS_PER_SEC) - t1; + t1 = 0.5*t1/((double) nits); + mflops = 5.0*((double) n)*((double) ln2)/((1.e+6)*t1); + printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops); + } + } + if((ln2%4)==0) nits /= 10; + n *= 2; + } + } + void cfft2(unsigned int n,float x[][2],float y[][2],float w[][2], float sign) + { + + /* + altivec version of cfft2 from Petersen and Arbenz book, "Intro. + to Parallel Computing", Oxford Univ. Press, 2003, Section 3.6 + wpp 14. Dec. 2003 + */ + + int jb,jc,jd,jw,k,k2,k4,lj,m,j,mj,mj2,pass,tgle; + float rp,up,wr[4],wu[4]; + float *a,*b,*c,*d; + const vector float vminus = (vector float)(-0.,0.,-0.,0.); + const vector float vzero = (vector float)(0.,0.,0.,0.); + const vector unsigned char pv3201 = + (vector unsigned char)(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11); + vector float V0,V1,V2,V3,V4,V5,V6,V7; + vector float V8,V9,V10,V11,V12,V13,V14,V15; + + if(n<=1){ + y[0][0] = x[0][0]; + y[0][1] = x[0][1]; + return; + } + m = (int) (log((float) n)/log(1.99)); + mj = 1; + mj2 = 2; + lj = n/2; + /* first pass thru data: x -> y */ + for(j=0;j<lj;j++){ + jb = n/2+j; jc = j*mj2; jd = jc + 1; + rp = w[j][0]; up = w[j][1]; + if(sign<0.0) up = -up; + y[jd][0] = rp*(x[j][0] - x[jb][0]) - up*(x[j][1] - x[jb][1]); + y[jd][1] = up*(x[j][0] - x[jb][0]) + rp*(x[j][1] - x[jb][1]); + y[jc][0] = x[j][0] + x[jb][0]; + y[jc][1] = x[j][1] + x[jb][1]; + } + if(n==2) return; + /* next pass is mj = 2 */ + mj = 2; + mj2 = 4; + lj = n/4; + a = (float *)&y[0][0]; + b = (float *)&y[n/2][0]; + c = (float *)&x[0][0]; + d = (float *)&x[mj][0]; + if(n==4){ + c = (float *)&y[0][0]; + d = (float *)&y[mj][0]; + } + for(j=0;j<lj;j++){ + jw = j*mj; jc = j*mj2; jd = 2*jc; + rp = w[jw][0]; up = w[jw][1]; + if(sign<0.0) up = -up; + wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp; + wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up; + V6 = vec_ld(0,wr); + V7 = vec_ld(0,wu); + V7 = vec_xor(V7,vminus); + V0 = vec_ld(0,(vector float *) (a+jc)); + V1 = vec_ld(0,(vector float *) (b+jc)); + V2 = vec_add(V0,V1); /* a + b */ + vec_st(V2,0,(vector float *) (c+jd)); /* store c */ + V3 = vec_sub(V0,V1); /* a - b */ + V4 = vec_perm(V3,V3,pv3201); + V0 = vec_madd(V6,V3,vzero); + V1 = vec_madd(V7,V4,vzero); + V2 = vec_add(V0,V1); /* w*(a - b) */ + vec_st(V2,0,(vector float*) (d+jd)); /* store d */ + } + if(n==4) return; + mj *= 2; + mj2 = 2*mj; + lj = n/mj2; + tgle = 0; + for(pass=2;pass<m-1;pass++){ + if(tgle){ + a = (float *)&y[0][0]; + b = (float *)&y[n/2][0]; + c = (float *)&x[0][0]; + d = (float *)&x[mj][0]; + tgle = 0; + } else { + a = (float *)&x[0][0]; + b = (float *)&x[n/2][0]; + c = (float *)&y[0][0]; + d = (float *)&y[mj][0]; + tgle = 1; + } + for(j=0; j<lj; j++){ + jw = j*mj; jc = j*mj2; jd = 2*jc; + rp = w[jw][0]; + up = w[jw][1]; + if(sign<0.0) up = -up; + wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp; + wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up; + V6 = vec_ld(0,wr); + V7 = vec_ld(0,wu); + V7 = vec_xor(V7,vminus); + for(k=0; k<mj; k+=4){ + k2 = 2*k; k4 = k2+4; + V0 = vec_ld(0,(vector float *) (a+jc+k2)); + V1 = vec_ld(0,(vector float *) (b+jc+k2)); + V2 = vec_add(V0,V1); /* a + b */ + vec_st(V2,0,(vector float*) (c+jd+k2)); /* store c */ + V3 = vec_sub(V0,V1); /* a - b */ + V4 = vec_perm(V3,V3,pv3201); + V0 = vec_madd(V6,V3,vzero); + V1 = vec_madd(V7,V4,vzero); + V2 = vec_add(V0,V1); /* w*(a - b) */ + vec_st(V2,0,(vector float *) (d+jd+k2)); /* store d */ + V8 = vec_ld(0,(vector float *) (a+jc+k4)); + V9 = vec_ld(0,(vector float *) (b+jc+k4)); + V10 = vec_add(V8,V9); /* a + b */ + vec_st(V10,0,(vector float *) (c+jd+k4)); /* store c */ + V11 = vec_sub(V8,V9); /* a - b */ + V12 = vec_perm(V11,V11,pv3201); + V8 = vec_madd(V6,V11,vzero); + V9 = vec_madd(V7,V12,vzero); + V10 = vec_add(V8,V9); /* w*(a - b) */ + vec_st(V10,0,(vector float *) (d+jd+k4)); /* store d */ + } + } + mj *= 2; + mj2 = 2*mj; + lj = n/mj2; + } + /* last pass thru data: in-place if previous in y */ + c = (float *)&y[0][0]; + d = (float *)&y[n/2][0]; + if(tgle) { + a = (float *)&y[0][0]; + b = (float *)&y[n/2][0]; + } else { + a = (float *)&x[0][0]; + b = (float *)&x[n/2][0]; + } + for(k=0; k<(n/2); k+=4){ + k2 = 2*k; k4 = k2+4; + V0 = vec_ld(0,(vector float *) (a+k2)); + V1 = vec_ld(0,(vector float *) (b+k2)); + V2 = vec_add(V0,V1); /* a + b */ + vec_st(V2,0,(vector float*) (c+k2)); /* store c */ + V3 = vec_sub(V0,V1); /* a - b */ + vec_st(V3,0,(vector float *) (d+k2)); /* store d */ + V4 = vec_ld(0,(vector float *) (a+k4)); + V5 = vec_ld(0,(vector float *) (b+k4)); + V6 = vec_add(V4,V5); /* a + b */ + vec_st(V6,0,(vector float *) (c+k4)); /* store c */ + V7 = vec_sub(V4,V5); /* a - b */ + vec_st(V7,0,(vector float *) (d+k4)); /* store d */ + } + } + void cffti(int n, float w[][2]) + { + + /* initialization routine for cfft2: computes + cos(twopi*k),sin(twopi*k) for k=0..n/2-1 + - the "twiddle factors" for a binary radix FFT */ + + int i,n2; + float aw,arg,pi; + pi = 3.141592653589793; + n2 = n/2; + aw = 2.0*pi/((float)n); + for(i=0;i<n2;i++){ + arg = aw*((float)i); + w[i][0] = cos(arg); + w[i][1] = sin(arg); + } + } + #include <math.h> + float ggl(float *ds) + { + + /* generate u(0,1) distributed random numbers. + Seed ds must be saved between calls. ggl is + essentially the same as the IMSL routine RNUM. + + W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: + a modification of a fortran version from + I. Vattulainen, Tampere Univ. of Technology, + Finland, 1992 */ + + double t,d2=0.2147483647e10; + t = (float) *ds; + t = fmod(0.16807e5*t,d2); + *ds = (float) t; + return((float) ((t-1.0e0)/(d2-1.0e0))); + } Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c:1.1 *** /dev/null Mon Apr 3 19:48:09 2006 --- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.isamax.c Mon Apr 3 19:47:54 2006 *************** *** 0 **** --- 1,131 ---- + #include <stdio.h> + #include <math.h> + #include <float.h> + #define N 1027 + main() + { + /* + Mac G-4 unit step isamax for arbitrary N code + This is an Altivec version of isamax0 from Section 3.5.7 + in Arbenz and Petersen, "Intro. to Parallel Computing" + Oxford Univ. Press, 2004 + wpp 5/8/2002 + */ + float x[N]; + float xb; + int err,flag,i,im,k,ki,kl,ib,n0,n; + int isamax(int,float *); + static float seed = 331.0; + float ggl(float*); + flag = 0; // error flag + kl = 3; + n0 = 1; + for(k=0;k<5;k++){ + for(ki=0;ki<kl;ki++){ + n = n0 + ki; + ib = 0; xb = 0.0; + for(i=0;i<n;i++){ + x[i] = ggl(&seed); + if(fabs(x[i]) > xb){ + xb = fabs(x[i]); + ib = i; + } + } + im = isamax(n,x); + err = ib - im; + if(err != 0){ + printf(" err in isamax: n = %d, ib = %d, im = %d\n",n,ib,im); + flag = 1; + } + } + n0 *= 4; // increase n + kl = 4; // for n > 1, 3 steps of increase in n + } + if(flag==0) printf(" All n tests pass\n"); + } + #define NS 12 + int isamax(int n, float *x) + { + float rbig,*xp; + int i,ii,nres,nsegs,ibig,irbig; + vector float V0,V1,V6; + vector bool int V3; + vector float V2 = (vector float) (0.0,1.0,2.0,3.0); + vector float V7 = (vector float) (0.0,1.0,2.0,3.0); + const vector float incr_4 = (vector float) (4.0,4.0,4.0,4.0); + const vector float minus0 = (vector float) (-0.0,-0.0,-0.0,-0.0); + float big,xbig[4],indx[4]; + // n < NS done in scalar mode + if(n < NS){ + ibig = 0; + rbig = 0.0; + for(i=0;i<n;i++){ + if(fabs(x[i]) > rbig){ + rbig = fabs(x[i]); + ibig = i; + } + } + return(ibig); + } + // n >= NS case done with altivec + nsegs = (n >> 2) - 1; + nres = n - ((nsegs+1) << 2); // nres = n mod 4 + V2 = vec_add(V2,incr_4); // increment next index + xp = x; + V0 = vec_ld(0,xp); xp += 4; // first four + V1 = vec_ld(0,xp); xp += 4; // next four + V0 = vec_abs(V0); // absolute value of first four + for(i=0;i<nsegs;i++){ + V1 = vec_abs(V1); // take absolute value fo next segment + V3 = vec_cmpgt(V1,V0); // compare accumulated 4 to next 4 + V0 = vec_sel(V0,V1,V3); // select new or old accumulation + V7 = vec_sel(V7,V2,V3); // select new index or old + V1 = vec_ld(0,xp); xp += 4; // bottom load next 4 + V2 = vec_add(V2,incr_4); + } + V1 = vec_ld(0,xp); xp += 4; // bottom load next four + V3 = vec_cmpgt(V1,V0); // compare accumulated to last 4 + V0 = vec_sel(V0,V1,V3); // select accumulation to last 4 + V7 = vec_sel(V7,V2,V3); // select index of accum. to last 4 + // Now finish up: segment maxima are in V0, indices in V7 + vec_st(V0,0,xbig); + vec_st(V7,0,indx); + ii = ((n >> 2) << 2); + big = 0.0; + ibig = 0.0; + for(i=0;i<nres;i++){ + if(fabs(x[ii]) > big){ + big = fabs(x[ii]); + ibig = ii; + } + ii++; + } + for(i=0;i<4;i++){ + if(xbig[i] > big){ + big = xbig[i]; + ibig = (int) indx[i]; + } + } + return(ibig); + } + #undef NS + #include <math.h> + float ggl(float *ds) + { + + /* generate u(0,1) distributed random numbers. + Seed ds must be saved between calls. ggl is + essentially the same as the IMSL routine RNUM. + + W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: + a modification of a fortran version from + I. Vattulainen, Tampere Univ. of Technology, + Finland, 1992 */ + + double t,d2=0.2147483647e10; + t = (float) *ds; + t = fmod(0.16807e5*t,d2); + *ds = (float) t; + return((float) ((t-1.0e0)/(d2-1.0e0))); + } + Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c:1.1 *** /dev/null Mon Apr 3 19:48:09 2006 --- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.sdot.c Mon Apr 3 19:47:54 2006 *************** *** 0 **** --- 1,103 ---- + #include <stdio.h> + #include <math.h> + #include <float.h> + #define N 1027 + main() + { + /* Mac G-4 sdot for arbitrary N wpp 6/8/2002 */ + float x[N],y[N],tres,res,eps; + int flag,i,k,ki,kl,n0,n; + static float seed = 331.0; + float sdot(int,float *,float *); + float ggl(float *); + eps = FLT_EPSILON; /* machine eps */ + n0 = 1; kl = 3; + flag = 0; + for(k=0;k<5;k++){ + for(ki=0;ki<kl;ki++){ + n = n0 + ki; + tres = 0.0; + for(i=0;i<n;i++){ + x[i] = ggl(&seed); + y[i] = ggl(&seed); + tres = tres + x[i]*y[i]; + } + res = sdot(n,x,y); + if(fabs(tres-res) > ((float) n)*eps){ + flag = 1; + printf(" n = %d, test sdot value = %e, sdot value = %e\n", + n,tres,res); + } + } + n0 *= 4; + kl = 4; + } + if(flag == 0) printf(" All n tests passed\n"); + } + #define NS 12 + float sdot(int n, float *x, float *y) + { + float sum,*xp,*yp; + int i,ii,nres,nsegs; + vector float V7 = (vector float)(0.0,0.0,0.0,0.0); + vector float V0,V1; + float psum[4]; + // n < NS done in scalar mode + if(n < NS){ + sum = x[0]*y[0]; + for(i=1;i<n;i++){ + sum += x[i]*y[i]; + } + return(sum); + } + // n >= NS case done with altivec + xp = x; + yp = y; + V0 = vec_ld(0,xp); xp += 4; // increment next index + V1 = vec_ld(0,yp); yp += 4; // increment next index + nsegs = (n >> 2) - 1; + nres = n - ((nsegs+1) << 2); // nres = n mod 4 + for(i=0;i<nsegs;i++){ + V7 = vec_madd(V0,V1,V7); // partial sum gp. of 4 + V0 = vec_ld(0,xp); xp += 4; // bottom load next 4 x + V1 = vec_ld(0,yp); yp += 4; // bottom load next 4 y + } + V7 = vec_madd(V0,V1,V7); // final partial sum + // Now finish up: segment maxima are in V0, indices in V7 + vec_st(V7,0,psum); + if(nres > 0){ + ii = ((n >> 2) << 2); + sum = x[ii]*y[ii]; + ii++; + for(i=1;i<nres;i++){ + sum += x[ii]*y[ii]; + ii++; + } + } else { + sum = 0.0; + } + for(i=0;i<4;i++){ + sum += psum[i]; + } + return(sum); + } + #undef NS + #include <math.h> + float ggl(float *ds) + { + + /* generate u(0,1) distributed random numbers. + Seed ds must be saved between calls. ggl is + essentially the same as the IMSL routine RNUM. + + W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: + a modification of a fortran version from + I. Vattulainen, Tampere Univ. of Technology, + Finland, 1992 */ + + double t,d2=0.2147483647e10; + t = (float) *ds; + t = fmod(0.16807e5*t,d2); + *ds = (float) t; + return((float) ((t-1.0e0)/(d2-1.0e0))); + } Index: llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c diff -c /dev/null llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c:1.1 *** /dev/null Mon Apr 3 19:48:09 2006 --- llvm-test/SingleSource/UnitTests/Vector/Altivec/alti.stepfft.c Mon Apr 3 19:47:54 2006 *************** *** 0 **** --- 1,204 ---- + #include <stdio.h> + #include <math.h> + #include <time.h> + #include <float.h> + #define N 128 + #define N2 N/2 + main() + { + /* SSE version of cfft2 - uses Apple intrinsics + W. Petersen, SAM. Math. ETHZ 2 May, 2002 */ + int first,i,icase,it,n; + int nits=1000; /* number of iterations for timing test */ + float error,fnm1,sign,z0,z1,ggl(); + static float seed = 331.0; + float *x,*y,*z,*w; + float t1,ln2,mflops; + void cffti(),cfft2(); + /* allocate storage for x,y,z,w on 4-word bndr. */ + x = (float *)malloc(8*N); + y = (float *)malloc(8*N); + z = (float *)malloc(8*N); + w = (float *)malloc(4*N); + first = 1; + for(icase=0;icase<2;icase++){ + if(first){ + for(i=0;i<2*N;i+=2){ + z0 = ggl(&seed); /* real part of array */ + z1 = ggl(&seed); /* imaginary part of array */ + x[i] = z0; + z[i] = z0; /* copy of initial real data */ + x[i+1] = z1; + z[i+1] = z1; /* copy of initial imag. data */ + } + } else { + for(i=0;i<2*N;i+=2){ + z0 = 0; /* real part of array */ + z1 = 0; /* imaginary part of array */ + x[i] = z0; + z[i] = z0; /* copy of initial real data */ + x[i+1] = z1; + z[i+1] = z1; /* copy of initial imag. data */ + } + } + /* initialize sine/cosine tables */ + n = N; + cffti(n,w); + /* transform forward, back */ + if(first){ + sign = 1.0; + cfft2(n,x,y,w,sign); + sign = -1.0; + cfft2(n,y,x,w,sign); + /* results should be same as initial multiplied by N */ + fnm1 = 1.0/((float) n); + error = 0.0; + for(i=0;i<2*N;i+=2){ + error += (z[i] - fnm1*x[i])*(z[i] - fnm1*x[i]) + + (z[i+1] - fnm1*x[i+1])*(z[i+1] - fnm1*x[i+1]); + } + error = sqrt(fnm1*error); + printf(" for n=%d, fwd/bck error=%e\n",N,error); + first = 0; + } else { + t1 = ((float)clock())/((float) CLOCKS_PER_SEC); + for(it=0;it<nits;it++){ + sign = +1.0; + cfft2(n,x,y,w,sign); + sign = -1.0; + cfft2(n,y,x,w,sign); + } + t1 = ((float)clock())/((float) CLOCKS_PER_SEC) - t1; + t1 = t1/(2.0*((float) nits)); + ln2 = 10.0; /* reset this for different N */ + mflops = 5.0*((float) N)*ln2/((1.e+6)*t1); + printf(" for n=%d, t1=%e, mflops=%e\n",n,t1,mflops); + } + } + } + void cfft2(n,x,y,w,sign) + int n; + float x[][2],y[][2],w[][2],sign; + { + int jb, m, j, mj, tgle; + void ccopy(),step(); + m = (int) (log((float) n)/log(1.99)); + mj = 1; + tgle = 1; /* toggling switch for work array */ + step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign); + for(j=0;j<m-2;j++){ + mj *= 2; + if(tgle){ + step(n,mj,&y[0][0],&y[n/2][0],&x[0][0],&x[mj][0],w,sign); + tgle = 0; + } else { + step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign); + tgle = 1; + } + } + /* last pass thru data: move y to x if needed */ + if(tgle) { + ccopy(n,y,x); + } + mj = n/2; + step(n,mj,&x[0][0],&x[n/2][0],&y[0][0],&y[mj][0],w,sign); + } + void cffti(int n, float w[][2]) + { + int i,n2; + float aw,arg,pi; + pi = 3.141592653589793; + n2 = n/2; + aw = 2.0*pi/((float)n); + for(i=0;i<n2;i++){ + arg = aw*((float)i); + w[i][0] = cos(arg); + w[i][1] = sin(arg); + } + } + void ccopy(int n, float x[][2], float y[][2]) + { + int i; + for(i=0;i<n;i++){ + y[i][0] = x[i][0]; + y[i][1] = x[i][1]; + } + } + #define cplx __complex__ + #define Re __real__ + #define Im __imag__ + void step + (unsigned int n,unsigned int mj, + cplx float *a, __complex__ float *b, + cplx float *c, __complex__ float *d, + cplx float *w, float sign) + { + int j,k,jc,jw,l,lj,mj2; + float rp,up; + float wr[4], wu[4]; + const vector float vminus = (vector float)(-0.,0.,-0.,0.); + const vector float vzero = (vector float)(0.,0.,0.,0.); + const vector unsigned char pv3201 = + (vector unsigned char)(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11); + vector float v0,v1,v2,v3,v4,v5,v6,v7; + + mj2 = 2*mj; + lj = n/mj2; + + for(j=0; j<lj; j++){ + jw = j*mj; jc = j*mj2; + rp = Re w[jw]; + up = Im w[jw]; + if(sign<0.0) up = -up; + if(mj<2){ + /* special case mj=1 */ + Re d[jc] = rp*(Re a[jw] - Re b[jw]) - + up*(Im a[jw] - Im b[jw]); + Im d[jc] = up*(Re a[jw] - Re b[jw]) + + rp*(Im a[jw] - Im b[jw]); + Re c[jc] = Re a[jw] + Re b[jw]; + Im c[jc] = Im a[jw] + Im b[jw]; + } else { + /* mj>=2 case */ + wr[0] = rp; wr[1] = rp; wr[2] = rp; wr[3] = rp; + wu[0] = up; wu[1] = up; wu[2] = up; wu[3] = up; + v6 = vec_ld(0,wr); + v7 = vec_ld(0,wu); + v7 = vec_xor(v7,vminus); + for(k=0; k<mj; k+=2){ + v0 = vec_ld(0,(vector float *) &a[jw+k]); /* read a */ + v1 = vec_ld(0,(vector float *) &b[jw+k]); /* read b */ + v2 = vec_add(v0, v1); + vec_st(v2,0,(vector float *) &c[jc+k]); /* store c */ + v3 = vec_sub(v0, v1); + v4 = vec_perm(v3,v3,pv3201); + v0 = vec_madd(v6,v3,vzero); + v1 = vec_madd(v7,v4,vzero); + v2 = vec_add(v0,v1); /* w*(a - b) */ + vec_st(v2,0,(vector float *) &d[jc+k]); /* store a */ + } + } + } + } + #undef cplx + #undef Re + #undef Im + #include <math.h> + float ggl(float *ds) + { + + /* generate u(0,1) distributed random numbers. + Seed ds must be saved between calls. ggl is + essentially the same as the IMSL routine RNUM. + + W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: + a modification of a fortran version from + I. Vattulainen, Tampere Univ. of Technology, + Finland, 1992 */ + + double t,d2=0.2147483647e10; + t = (float) *ds; + t = fmod(0.16807e5*t,d2); + *ds = (float) t; + return((float) ((t-1.0e0)/(d2-1.0e0))); + } _______________________________________________ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits