Path: utzoo!utgpu!watmath!clyde!att!osu-cis!tut.cis.ohio-state.edu!unmvax!ncar!tank!nic.MR.NET!umn-cs!hall!tdg
From: tdg@hall.cray.com (Terry Greyzck)
Newsgroups: comp.arch
Subject: Re: Lawerence Livermore Loops in C
Keywords: loops, livermore, kernel
Message-ID: <12074@hall.cray.com>
Date: 12 Dec 88 17:38:00 GMT
References: <584@gt-eedsp.UUCP> <335@taux02.UUCP>
Reply-To: tdg@hall.UUCP (Terry Greyzck)
Organization: Cray Research, Inc., Santa Fe, NM
Lines: 442

In article <335@taux02.UUCP> yuval@taux02.UUCP (Gideon Yuval) writes:
>In article <584@gt-eedsp.UUCP> schw@gt-eedsp.UUCP (Dave Schwartz) writes:
>>
>>If you have a copy of the Lawerence Livermore Loops set that has been
>>ported to C. I would appreciate if you would send me a copy.
>
>Me too, please.
>
There seems to be some interest, so I'll post the 14-loop version here.
I am not aware of a C version of the 24-loop kernels, although someone
at the National Magnetic Fusion Energy Computing Center (NMFECC, or just
MFE) was working on one some time back.  If you're into serious
benchmarking, get the 24-loop version.  If you're just curious about
machine speed, the 14-loop version is probably okay, and it is much,
much, shorter.

The C version includes a checksum self-check, so you will know if you
obtained the correct answers.  Someone should retrofit this into the
Fortran version.

The only system dependency is the use of the clock() function to
measure CPU time used.  Set the variable 'scale' to indicate what
multiple of microseconds clock() returns on your system.

The opinions, et cetera, in this posting are my own and do not reflect
those of anyone else.  At least, not intentionally.

#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g..  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  lloops.c
# Wrapped by tdg@zia on Mon Dec 12 10:22:25 1988
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'lloops.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'lloops.c'\"
else
echo shar: Extracting \"'lloops.c'\" \(13098 characters\)
sed "s/^X//" >'lloops.c' <<'END_OF_FILE'
X/*
X *     program analysis evaluates execution rates of c/fortran do-loops.
X *     through-put is measured in units of millions of floating-point
X *     operations per second, called mflops.
X */
X#define MAXERR  1.0e-6
X 
X#include <stdio.h>
X#include <math.h>
X 
Xdouble  x[1002], y[1002], z[1002], u[501], px[16][101], cx[16][101];
Xdouble  u1[6][23][3], u2[6][23][3], u3[6][23][3];
Xdouble  b[65][9], bnk1[6], c[65][9], bnk2[6], p[5][513], bnk3[6];
Xdouble  h[65][9], bnk4[6], bnk5[6], ex[68];
Xdouble  rh[68], dex[68], vx[151], xx[151], grd[151];
Xint     e[193], f[193];
X 
Xlong    nrops[] = {  0,  5, 10,  2,  2,
X                         2,  2, 16, 36,
X                        17,  9,  1,  1,
X                         7, 11 };
X 
Xlong    loops[] = {   0, 400, 200,1000, 510,
X                        1000,1000, 120,  40,
X                         100, 100,1000,1000,
X                         128, 150 };
X 
Xdouble  checks[] = {  0,  0.811986948148e+07,  0.356310000000e+03,
X                          0.356310000000e+03, -0.402412007078e+05,
X                          0.136579037764e+06,  0.419716278716e+06,
X                          0.429449847526e+07,  0.314064400000e+06,
X                          0.182709000000e+07, -0.140415250000e+09,
X                          0.374895020500e+09,  0.000000000000e+00,
X                          0.171449024000e+06, -0.510829560800e+07 };
X 
Xmain()
X{
X        extern void     init();
X        extern long     clock();
X 
X        register int    nt, lw, nl1, nl2;
X        register int    i, i1, i2, ip, ir, ix, j, j1, j2, k, kx, ky, l, m;
X        double  ts[21], rt[21], rpm[21], cksum[21];
X        double  r, t, a11, a12, a13, sig, a21, a22, a23, a31, a32, a33;
X        double  bm28, bm27, bm26, bm25, bm24, bm23, bm22, c0, flx, rx1;
X        register double q, s, scale, uu, du1, du2, du3, ar, br, cr, xi, ri;
X        long    mops[20];
X
X        for (i=1; i<=20; i++)
X                cksum[i] = 0.0;
X        r = 4.86;        t = 276.0;        a11 = 0.5;
X        a12 = 0.33;      a13 = 0.25;       sig = 0.8;
X        a21 = 0.20;      a22 = 0.167;      a23 = 0.141;
X        a31 = 0.125;     a32 = 0.111;      a33 = 0.10;
X        bm28 = 0.1;      bm27 = 0.2;       bm26 = 0.3;
X        bm25 = 0.4;      bm24 = 0.5;       bm23 = 0.6;
X        bm22 = 0.7;      c0 = 0.8;         flx = 4.689;
X        rx1 = 64.0;
X/*
X *     end of initialization -- begin timing
X */
X 
X/* loop 1      hydro excerpt */
X 
X        init();
X        ts[1] = (double) clock();
X        q = 0.0;
X        for (k=1; k<=400; k++)
X                x[k] = q+y[k]*(r*z[k+10]+t*z[k+11]);
X        ts[1] = (double) clock() - ts[1];
X        for (k=1; k<=400; k++)
X                cksum[1] += (double)k * x[k];
X 
X/* loop 2      mlr, inner product */
X 
X        init();
X        ts[2] = (double) clock();
X        q = 0.0;
X        for (k=1; k<=996; k+=5)
X                q += z[k]*x[k]+z[k+1]*x[k+1]+z[k+2]*x[k+2]+
X                        z[k+3]*x[k+3]+z[k+4]*x[k+4];
X        ts[2] = (double) clock() - ts[2];
X        cksum[2] = q;
X 
X/* loop 3      inner prod */
X 
X        init();
X        ts[3] = (double) clock();
X        q = 0.0;
X        for (k=1; k<=1000; k++)
X                q += z[k]*x[k];
X        ts[3] = (double) clock() - ts[3];
X        cksum[3] = q;
X 
X/* loop 4      banded linear equarions */
X 
X        init();
X        ts[4] = (double) clock();
X        for (l=7; l<=107; l+=50) {
X                lw=l;
X                for (j=30; j<=870; j+=5)
X                        x[l-1] -= x[lw++]*y[j];
X                x[l-1] = y[5]*x[l-1];
X                }
X        ts[4] = (double) clock() - ts[4];
X        for (l=7; l<=107; l+=50)
X                cksum[4] += (double) l * x[l-1];
X 
X/* loop 5      tri-diagonal elimination, below diagonal */
X 
X        init();
X        ts[5] = (double) clock();
X        for (i=2; i<=998; i+=3)  {
X                x[i] = z[i]*(y[i]-x[i-1]);
X                x[i+1] = z[i+1]*(y[i+1]-x[i]);
X                x[i+2] = z[i+2]*(y[i+2]-x[i+1]);
X                }
X        ts[5] = (double) clock() - ts[5];
X        for (i=2; i<=1000; i++)
X                cksum[5] += (double) i * x[i];
X 
X/* loop 6      tri-diagonal elimination, above diagonal */
X 
X        init();
X        ts[6] = (double) clock();
X        for (j=3; j<=999; j+=3) {
X                i = 1003-j;
X                x[i] = x[i]-z[i]*x[i+1];
X                x[i-1] = x[i-1]-z[i-1]*x[i];
X                x[i-2] = x[i-2]-z[i-2]*x[i-1];
X                }
X        ts[6] = (double) clock() - ts[6];
X        for (j=1; j<=999; j++)  {
X                l = 1001-j;
X                cksum[6] += (double) j * x[l];
X                }
X 
X/* loop 7      equation of state excerpt */
X 
X        init();
X        ts[7] = (double) clock();
X        for (m=1; m<=120; m++)
X                x[m] = u[m]+r*(z[m]+r*y[m])+
X                       t*(u[m+3]+r*(u[m+2]+r*u[m+1])+
X                       t*(u[m+6]+r*(u[m+5]+r*u[m+4])));
X        ts[7] = (double) clock() - ts[7];
X        for (m=1; m<=120; m++)
X                cksum[7] += (double) m * x[m];
X 
X/* loop 8      p.d.e. integration */
X 
X        init();
X        ts[8] = (double) clock();
X        nl1 = 1;        nl2 = 2;
X        for (kx=2; kx<=3; kx++)  {
X                for (ky=2; ky<=21; ky++) {
X                        du1 = u1[kx][ky+1][nl1]-u1[kx][ky-1][nl1];
X                        du2 = u2[kx][ky+1][nl1]-u2[kx][ky-1][nl1];
X                        du3 = u3[kx][ky+1][nl1]-u3[kx][ky-1][nl1];
X                        u1[kx][ky][nl2] = u1[kx][ky][nl1]+a11*du1+
X                                a12*du2+a13*du3+sig*(u1[kx+1][ky][nl1]
X                                -2.0*u1[kx][ky][nl1]+u1[kx-1][ky][nl1]);
X                        u2[kx][ky][nl2] = u2[kx][ky][nl1]+a21*du1+
X                                a22*du2+a23*du3+sig*(u2[kx+1][ky][nl1]
X                                -2.0*u2[kx][ky][nl1]+u2[kx-1][ky][nl1]);
X                        u3[kx][ky][nl2] = u3[kx][ky][nl1]+a31*du1+
X                                a32*du2+a33*du3+sig*(u3[kx+1][ky][nl1]
X                                -2.0*u3[kx][ky][nl1]+u3[kx-1][ky][nl1]);
X                        }
X                }
X        ts[8] = (double) clock() - ts[8];
X        for (i=1; i<=2; i++)
X                for (kx=2; kx<=3; kx++)
X                        for (ky=2; ky<=21; ky++)
X                                cksum[8] += (double) kx * (double) ky *
X                                        (double) i * (u1[kx][ky][i]+
X                                        u2[kx][ky][i]+u3[kx][ky][i]);
X 
X/* loop 9      integrate predictors */
X 
X        init();
X        ts[9] = (double) clock();
X        for (i=1; i<=100; i++)
X                px[1][i] = bm28*px[13][i] + bm27*px[12][i] +
X                        bm26*px[11][i] + bm25*px[10][i] + bm24*px[9][i] +
X                        bm23*px[8][i] + bm22*px[7][i] +
X                        c0*(px[5][i] + px[6][i]) + px[3][i];
X        ts[9] = (double) clock() - ts[9];
X        for (i=1; i<=100; i++)
X                cksum[9] += (double) i * px[1][i];
X 
X/* loop 10     difference predictors */
X 
X        init();
X        ts[10] = (double) clock();
X        for (i=1; i<=100; i++)   {
X                ar = cx[5][i];
X                br = ar-px[5][i];
X                px[5][i] = ar;
X                cr = br-px[6][i];
X                px[6][i] = br;
X                ar = cr-px[7][i];
X                px[7][i] = cr;
X                br = ar-px[8][i];
X                px[8][i] = ar;
X                cr = br-px[9][i];
X                px[9][i] = br;
X                ar = cr-px[10][i];
X                px[10][i] = cr;
X                br = ar-px[11][i];
X                px[11][i] = ar;
X                cr = br-px[12][i];
X                px[12][i] = br;
X                px[14][i] = cr-px[13][i];
X                px[13][i] = cr;
X                }
X        ts[10] = (double) clock() - ts[10];
X        for (i=1; i<=100; i++)
X                for (k=5; k<=14; k++)
X                        cksum[10] += (double) k * (double) i * px[k][i];
X 
X/* loop 11     first sum. */
X 
X        init();
X        ts[11] = (double) clock();
X        x[1] = y[1];
X        for (k=2; k<=1000; k++)
X                x[k] = x[k-1]+y[k];
X        ts[11] = (double) clock() - ts[11];
X        for (k=1; k<=1000; k++)
X                cksum[11] += (double) k * x[k];
X 
X/* loop 12     first diff. */
X 
X        init();
X        ts[12] = (double) clock();
X        for (k=1; k<=999; k++)
X                x[k] = y[k+1]-y[k];
X        ts[12] = (double) clock() - ts[12];
X        for (k=1; k<=999; k++)
X                cksum[12] += (double) k * x[k];
X 
X 
X/* loop 13      2-d particle pusher */
X 
X        init();
X        ts[13] = (double) clock();
X        for (ip=1; ip<=128; ip++)        {
X                i1 = p[1][ip];  j1 = p[2][ip];
X                p[3][ip] += b[i1][j1];
X                p[4][ip] += c[i1][j1];
X                p[1][ip] += p[3][ip];
X                p[2][ip] += p[4][ip];
X                i2 = (int) p[1][ip];
X                j2 = (int) p[2][ip];
X                p[1][ip] += y[i2+32];
X                p[2][ip] += z[j2+32];
X                i2 += e[i2+32];         j2 += f[j2+32];
X                h[i2][j2] += 1.0;
X                }
X        ts[13] = (double) clock() - ts[13];
X        for (ip=1; ip<=128; ip++)
X                cksum[13] += (double) ip * (p[3][ip]+p[4][ip]+p[1][ip]+
X                        p[2][ip]);
X        for (k=1; k<=64; k++)
X                for (ix=1; ix<=8; ix++)
X                        cksum[13] += (double) k * (double) ix * h[k][ix];
X 
X/* loop 14      1-d particle pusher */
X 
X        init();
X        ts[14] = (double) clock();
X        for (k=1; k<=150; k++)   {
X                ix = (int) grd[k];
X                xi = (double) ix;
X                vx[k] += ex[ix]+(xx[k]-xi)*dex[ix];
X                xx[k] += vx[k]+flx;
X                ir = (int) xx[k];
X                ri = (double) ir;
X                rx1 = xx[k]-ri;
X                ir = abs(ir % 64);
X                xx[k] = ri+rx1;
X                rh[ir] += 1.0-rx1;
X                rh[ir+1] += rx1;
X                }
X        ts[14] = (double) clock() - ts[14];
X        for (k=1; k<=150; k++)
X                cksum[14] += (double) k * (vx[k]+xx[k]);
X        for (k=1; k<=67; k++)
X                cksum[14] += (double) k * rh[k];
X 
X/* time the clock call */
X 
X        ts[15] = (double) clock();
X        ts[15] = (double) clock() - ts[15];
X 
X/* scale= set to convert time to micro-seconds */
X 
X        scale=1.0;
X        rt[15] = ts[15]*scale;
X        printf("clock overhead = %9.2f usec\n",rt[15]);
X 
X        nt = 14.0;
X        t = s = uu = 0.0;
X        for (k=1; k<=nt; k++)    {
X                rt[k] = (ts[k]-ts[15])*scale;
X                t += rt[k];
X                mops[k] = nrops[k]*loops[k];
X                s += (double) mops[k];
X                rpm[k] = 0.0;
X                if (rt[k] != 0.0)
X                        rpm[k] = (double) mops[k]/rt[k];
X                uu += rpm[k];
X                }
X        uu /= (double) nt;
X        s /= t;
X        printf("\nloop         checksum       flops    time    mflops \n");
X        for (k=1; k<=nt; k++)   {
X                printf("%4d  %20.12e%7d%9.1f%9.3f\n",k,cksum[k],mops[k],
X                        rt[k],rpm[k]);
X                if (fabs(cksum[k]-checks[k]) > fabs(checks[k]*MAXERR))
X                        printf("      %20.12e *** expected checksum\n",
X                                checks[k]);
X                }
X        printf("\n   average mflops=%9.3f\n",uu);
X}
Xvoid
Xinit()
X{
X        register int    j, k, l;
X
X        for (k=1; k<=1000; k++)  {
X                x[k] = 1.11;
X                y[k] = 1.123;
X                z[k] = 0.321;
X                }
X
X	for (k=1; k<=500; k++)
X		u[k] = 0.00025;
X 
X        for (k=1; k<=15; k++)    {
X                for (l=1; l<=100; l++)   {
X                        px[k][l] = l;
X                        cx[k][l] = l;
X                        }
X                }
X 
X        for (j=1; j<6; j++)     {
X                for (k=1; k<23; k++)    {
X                        for (l=1; l<3; l++)     {
X                                u1[j][k][l]=k;
X                                u2[j][k][l]=k+k;
X                                u3[j][k][l]=k+k+k;
X                                }
X                        }
X                }
X 
X        for (j=1; j<65; j++)    {
X                for (k=1; k<9; k++)     {
X                        b[j][k] = 1.00025;
X                        c[j][k] = 1.00025;
X                        h[j][k] = 1.00025;
X                        }
X                }
X 
X        for (j=1; j<6; j++)     {
X                bnk1[j] = j*100;
X                bnk2[j] = j*110;
X                bnk3[j] = j*120;
X                bnk4[j] = j*130;
X                bnk5[j] = j*140;
X                }
X 
X        for (j=1; j<5; j++)     {
X                for (k=1; k<513; k++)   {
X                        p[j][k] = 1.00025;
X                        }
X                }
X 
X        for (j=1; j<193; j++)
X                e[j] = f[j] = 1;
X 
X        for (j=1; j<68; j++)    {
X                ex[j] = rh[j] = dex[j] = (double) j;
X                }
X 
X        for (j=1; j<151; j++)   {
X                vx[j] = 0.001;
X                xx[j] = 0.001;
X                grd[j] = (double) (j/8+3);
X                }
X}
END_OF_FILE
if test 13098 -ne `wc -c <'lloops.c'`; then
    echo shar: \"'lloops.c'\" unpacked with wrong size!
fi
# end of 'lloops.c'
fi
echo shar: End of shell archive.
exit 0