Path: utzoo!utgpu!watmath!clyde!att!osu-cis!tut.cis.ohio-state.edu!unmvax!ncar!tank!nic.MR.NET!umn-cs!hall!tdg From: tdg@hall.cray.com (Terry Greyzck) Newsgroups: comp.arch Subject: Re: Lawerence Livermore Loops in C Keywords: loops, livermore, kernel Message-ID: <12074@hall.cray.com> Date: 12 Dec 88 17:38:00 GMT References: <584@gt-eedsp.UUCP> <335@taux02.UUCP> Reply-To: tdg@hall.UUCP (Terry Greyzck) Organization: Cray Research, Inc., Santa Fe, NM Lines: 442 In article <335@taux02.UUCP> yuval@taux02.UUCP (Gideon Yuval) writes: >In article <584@gt-eedsp.UUCP> schw@gt-eedsp.UUCP (Dave Schwartz) writes: >> >>If you have a copy of the Lawerence Livermore Loops set that has been >>ported to C. I would appreciate if you would send me a copy. > >Me too, please. > There seems to be some interest, so I'll post the 14-loop version here. I am not aware of a C version of the 24-loop kernels, although someone at the National Magnetic Fusion Energy Computing Center (NMFECC, or just MFE) was working on one some time back. If you're into serious benchmarking, get the 24-loop version. If you're just curious about machine speed, the 14-loop version is probably okay, and it is much, much, shorter. The C version includes a checksum self-check, so you will know if you obtained the correct answers. Someone should retrofit this into the Fortran version. The only system dependency is the use of the clock() function to measure CPU time used. Set the variable 'scale' to indicate what multiple of microseconds clock() returns on your system. The opinions, et cetera, in this posting are my own and do not reflect those of anyone else. At least, not intentionally. #! /bin/sh # This is a shell archive. Remove anything before this line, then unpack # it by saving it into a file and typing "sh file". To overwrite existing # files, type "sh file -c". You can also feed this as standard input via # unshar, or by typing "sh 'lloops.c' <<'END_OF_FILE' X/* X * program analysis evaluates execution rates of c/fortran do-loops. X * through-put is measured in units of millions of floating-point X * operations per second, called mflops. X */ X#define MAXERR 1.0e-6 X X#include X#include X Xdouble x[1002], y[1002], z[1002], u[501], px[16][101], cx[16][101]; Xdouble u1[6][23][3], u2[6][23][3], u3[6][23][3]; Xdouble b[65][9], bnk1[6], c[65][9], bnk2[6], p[5][513], bnk3[6]; Xdouble h[65][9], bnk4[6], bnk5[6], ex[68]; Xdouble rh[68], dex[68], vx[151], xx[151], grd[151]; Xint e[193], f[193]; X Xlong nrops[] = { 0, 5, 10, 2, 2, X 2, 2, 16, 36, X 17, 9, 1, 1, X 7, 11 }; X Xlong loops[] = { 0, 400, 200,1000, 510, X 1000,1000, 120, 40, X 100, 100,1000,1000, X 128, 150 }; X Xdouble checks[] = { 0, 0.811986948148e+07, 0.356310000000e+03, X 0.356310000000e+03, -0.402412007078e+05, X 0.136579037764e+06, 0.419716278716e+06, X 0.429449847526e+07, 0.314064400000e+06, X 0.182709000000e+07, -0.140415250000e+09, X 0.374895020500e+09, 0.000000000000e+00, X 0.171449024000e+06, -0.510829560800e+07 }; X Xmain() X{ X extern void init(); X extern long clock(); X X register int nt, lw, nl1, nl2; X register int i, i1, i2, ip, ir, ix, j, j1, j2, k, kx, ky, l, m; X double ts[21], rt[21], rpm[21], cksum[21]; X double r, t, a11, a12, a13, sig, a21, a22, a23, a31, a32, a33; X double bm28, bm27, bm26, bm25, bm24, bm23, bm22, c0, flx, rx1; X register double q, s, scale, uu, du1, du2, du3, ar, br, cr, xi, ri; X long mops[20]; X X for (i=1; i<=20; i++) X cksum[i] = 0.0; X r = 4.86; t = 276.0; a11 = 0.5; X a12 = 0.33; a13 = 0.25; sig = 0.8; X a21 = 0.20; a22 = 0.167; a23 = 0.141; X a31 = 0.125; a32 = 0.111; a33 = 0.10; X bm28 = 0.1; bm27 = 0.2; bm26 = 0.3; X bm25 = 0.4; bm24 = 0.5; bm23 = 0.6; X bm22 = 0.7; c0 = 0.8; flx = 4.689; X rx1 = 64.0; X/* X * end of initialization -- begin timing X */ X X/* loop 1 hydro excerpt */ X X init(); X ts[1] = (double) clock(); X q = 0.0; X for (k=1; k<=400; k++) X x[k] = q+y[k]*(r*z[k+10]+t*z[k+11]); X ts[1] = (double) clock() - ts[1]; X for (k=1; k<=400; k++) X cksum[1] += (double)k * x[k]; X X/* loop 2 mlr, inner product */ X X init(); X ts[2] = (double) clock(); X q = 0.0; X for (k=1; k<=996; k+=5) X q += z[k]*x[k]+z[k+1]*x[k+1]+z[k+2]*x[k+2]+ X z[k+3]*x[k+3]+z[k+4]*x[k+4]; X ts[2] = (double) clock() - ts[2]; X cksum[2] = q; X X/* loop 3 inner prod */ X X init(); X ts[3] = (double) clock(); X q = 0.0; X for (k=1; k<=1000; k++) X q += z[k]*x[k]; X ts[3] = (double) clock() - ts[3]; X cksum[3] = q; X X/* loop 4 banded linear equarions */ X X init(); X ts[4] = (double) clock(); X for (l=7; l<=107; l+=50) { X lw=l; X for (j=30; j<=870; j+=5) X x[l-1] -= x[lw++]*y[j]; X x[l-1] = y[5]*x[l-1]; X } X ts[4] = (double) clock() - ts[4]; X for (l=7; l<=107; l+=50) X cksum[4] += (double) l * x[l-1]; X X/* loop 5 tri-diagonal elimination, below diagonal */ X X init(); X ts[5] = (double) clock(); X for (i=2; i<=998; i+=3) { X x[i] = z[i]*(y[i]-x[i-1]); X x[i+1] = z[i+1]*(y[i+1]-x[i]); X x[i+2] = z[i+2]*(y[i+2]-x[i+1]); X } X ts[5] = (double) clock() - ts[5]; X for (i=2; i<=1000; i++) X cksum[5] += (double) i * x[i]; X X/* loop 6 tri-diagonal elimination, above diagonal */ X X init(); X ts[6] = (double) clock(); X for (j=3; j<=999; j+=3) { X i = 1003-j; X x[i] = x[i]-z[i]*x[i+1]; X x[i-1] = x[i-1]-z[i-1]*x[i]; X x[i-2] = x[i-2]-z[i-2]*x[i-1]; X } X ts[6] = (double) clock() - ts[6]; X for (j=1; j<=999; j++) { X l = 1001-j; X cksum[6] += (double) j * x[l]; X } X X/* loop 7 equation of state excerpt */ X X init(); X ts[7] = (double) clock(); X for (m=1; m<=120; m++) X x[m] = u[m]+r*(z[m]+r*y[m])+ X t*(u[m+3]+r*(u[m+2]+r*u[m+1])+ X t*(u[m+6]+r*(u[m+5]+r*u[m+4]))); X ts[7] = (double) clock() - ts[7]; X for (m=1; m<=120; m++) X cksum[7] += (double) m * x[m]; X X/* loop 8 p.d.e. integration */ X X init(); X ts[8] = (double) clock(); X nl1 = 1; nl2 = 2; X for (kx=2; kx<=3; kx++) { X for (ky=2; ky<=21; ky++) { X du1 = u1[kx][ky+1][nl1]-u1[kx][ky-1][nl1]; X du2 = u2[kx][ky+1][nl1]-u2[kx][ky-1][nl1]; X du3 = u3[kx][ky+1][nl1]-u3[kx][ky-1][nl1]; X u1[kx][ky][nl2] = u1[kx][ky][nl1]+a11*du1+ X a12*du2+a13*du3+sig*(u1[kx+1][ky][nl1] X -2.0*u1[kx][ky][nl1]+u1[kx-1][ky][nl1]); X u2[kx][ky][nl2] = u2[kx][ky][nl1]+a21*du1+ X a22*du2+a23*du3+sig*(u2[kx+1][ky][nl1] X -2.0*u2[kx][ky][nl1]+u2[kx-1][ky][nl1]); X u3[kx][ky][nl2] = u3[kx][ky][nl1]+a31*du1+ X a32*du2+a33*du3+sig*(u3[kx+1][ky][nl1] X -2.0*u3[kx][ky][nl1]+u3[kx-1][ky][nl1]); X } X } X ts[8] = (double) clock() - ts[8]; X for (i=1; i<=2; i++) X for (kx=2; kx<=3; kx++) X for (ky=2; ky<=21; ky++) X cksum[8] += (double) kx * (double) ky * X (double) i * (u1[kx][ky][i]+ X u2[kx][ky][i]+u3[kx][ky][i]); X X/* loop 9 integrate predictors */ X X init(); X ts[9] = (double) clock(); X for (i=1; i<=100; i++) X px[1][i] = bm28*px[13][i] + bm27*px[12][i] + X bm26*px[11][i] + bm25*px[10][i] + bm24*px[9][i] + X bm23*px[8][i] + bm22*px[7][i] + X c0*(px[5][i] + px[6][i]) + px[3][i]; X ts[9] = (double) clock() - ts[9]; X for (i=1; i<=100; i++) X cksum[9] += (double) i * px[1][i]; X X/* loop 10 difference predictors */ X X init(); X ts[10] = (double) clock(); X for (i=1; i<=100; i++) { X ar = cx[5][i]; X br = ar-px[5][i]; X px[5][i] = ar; X cr = br-px[6][i]; X px[6][i] = br; X ar = cr-px[7][i]; X px[7][i] = cr; X br = ar-px[8][i]; X px[8][i] = ar; X cr = br-px[9][i]; X px[9][i] = br; X ar = cr-px[10][i]; X px[10][i] = cr; X br = ar-px[11][i]; X px[11][i] = ar; X cr = br-px[12][i]; X px[12][i] = br; X px[14][i] = cr-px[13][i]; X px[13][i] = cr; X } X ts[10] = (double) clock() - ts[10]; X for (i=1; i<=100; i++) X for (k=5; k<=14; k++) X cksum[10] += (double) k * (double) i * px[k][i]; X X/* loop 11 first sum. */ X X init(); X ts[11] = (double) clock(); X x[1] = y[1]; X for (k=2; k<=1000; k++) X x[k] = x[k-1]+y[k]; X ts[11] = (double) clock() - ts[11]; X for (k=1; k<=1000; k++) X cksum[11] += (double) k * x[k]; X X/* loop 12 first diff. */ X X init(); X ts[12] = (double) clock(); X for (k=1; k<=999; k++) X x[k] = y[k+1]-y[k]; X ts[12] = (double) clock() - ts[12]; X for (k=1; k<=999; k++) X cksum[12] += (double) k * x[k]; X X X/* loop 13 2-d particle pusher */ X X init(); X ts[13] = (double) clock(); X for (ip=1; ip<=128; ip++) { X i1 = p[1][ip]; j1 = p[2][ip]; X p[3][ip] += b[i1][j1]; X p[4][ip] += c[i1][j1]; X p[1][ip] += p[3][ip]; X p[2][ip] += p[4][ip]; X i2 = (int) p[1][ip]; X j2 = (int) p[2][ip]; X p[1][ip] += y[i2+32]; X p[2][ip] += z[j2+32]; X i2 += e[i2+32]; j2 += f[j2+32]; X h[i2][j2] += 1.0; X } X ts[13] = (double) clock() - ts[13]; X for (ip=1; ip<=128; ip++) X cksum[13] += (double) ip * (p[3][ip]+p[4][ip]+p[1][ip]+ X p[2][ip]); X for (k=1; k<=64; k++) X for (ix=1; ix<=8; ix++) X cksum[13] += (double) k * (double) ix * h[k][ix]; X X/* loop 14 1-d particle pusher */ X X init(); X ts[14] = (double) clock(); X for (k=1; k<=150; k++) { X ix = (int) grd[k]; X xi = (double) ix; X vx[k] += ex[ix]+(xx[k]-xi)*dex[ix]; X xx[k] += vx[k]+flx; X ir = (int) xx[k]; X ri = (double) ir; X rx1 = xx[k]-ri; X ir = abs(ir % 64); X xx[k] = ri+rx1; X rh[ir] += 1.0-rx1; X rh[ir+1] += rx1; X } X ts[14] = (double) clock() - ts[14]; X for (k=1; k<=150; k++) X cksum[14] += (double) k * (vx[k]+xx[k]); X for (k=1; k<=67; k++) X cksum[14] += (double) k * rh[k]; X X/* time the clock call */ X X ts[15] = (double) clock(); X ts[15] = (double) clock() - ts[15]; X X/* scale= set to convert time to micro-seconds */ X X scale=1.0; X rt[15] = ts[15]*scale; X printf("clock overhead = %9.2f usec\n",rt[15]); X X nt = 14.0; X t = s = uu = 0.0; X for (k=1; k<=nt; k++) { X rt[k] = (ts[k]-ts[15])*scale; X t += rt[k]; X mops[k] = nrops[k]*loops[k]; X s += (double) mops[k]; X rpm[k] = 0.0; X if (rt[k] != 0.0) X rpm[k] = (double) mops[k]/rt[k]; X uu += rpm[k]; X } X uu /= (double) nt; X s /= t; X printf("\nloop checksum flops time mflops \n"); X for (k=1; k<=nt; k++) { X printf("%4d %20.12e%7d%9.1f%9.3f\n",k,cksum[k],mops[k], X rt[k],rpm[k]); X if (fabs(cksum[k]-checks[k]) > fabs(checks[k]*MAXERR)) X printf(" %20.12e *** expected checksum\n", X checks[k]); X } X printf("\n average mflops=%9.3f\n",uu); X} Xvoid Xinit() X{ X register int j, k, l; X X for (k=1; k<=1000; k++) { X x[k] = 1.11; X y[k] = 1.123; X z[k] = 0.321; X } X X for (k=1; k<=500; k++) X u[k] = 0.00025; X X for (k=1; k<=15; k++) { X for (l=1; l<=100; l++) { X px[k][l] = l; X cx[k][l] = l; X } X } X X for (j=1; j<6; j++) { X for (k=1; k<23; k++) { X for (l=1; l<3; l++) { X u1[j][k][l]=k; X u2[j][k][l]=k+k; X u3[j][k][l]=k+k+k; X } X } X } X X for (j=1; j<65; j++) { X for (k=1; k<9; k++) { X b[j][k] = 1.00025; X c[j][k] = 1.00025; X h[j][k] = 1.00025; X } X } X X for (j=1; j<6; j++) { X bnk1[j] = j*100; X bnk2[j] = j*110; X bnk3[j] = j*120; X bnk4[j] = j*130; X bnk5[j] = j*140; X } X X for (j=1; j<5; j++) { X for (k=1; k<513; k++) { X p[j][k] = 1.00025; X } X } X X for (j=1; j<193; j++) X e[j] = f[j] = 1; X X for (j=1; j<68; j++) { X ex[j] = rh[j] = dex[j] = (double) j; X } X X for (j=1; j<151; j++) { X vx[j] = 0.001; X xx[j] = 0.001; X grd[j] = (double) (j/8+3); X } X} END_OF_FILE if test 13098 -ne `wc -c <'lloops.c'`; then echo shar: \"'lloops.c'\" unpacked with wrong size! fi # end of 'lloops.c' fi echo shar: End of shell archive. exit 0