Path: utzoo!utgpu!news-server.csri.toronto.edu!cs.utexas.edu!usc!zaphod.mps.ohio-state.edu!rpi!leah!bingvaxu!vu0310
From: vu0310@bingvaxu.cc.binghamton.edu (R. Kym Horsell)
Newsgroups: comp.arch
Subject: benchmark for evaluating extended precision
Summary: good enough?
Keywords: extended precision,multiply,benchmark,arithmetic
Message-ID: <3989@bingvaxu.cc.binghamton.edu>
Date: 12 Sep 90 06:52:50 GMT
Reply-To: vu0310@bingvaxu.cc.binghamton.edu.cc.binghamton.edu (R. Kym Horsell)
Organization: SUNY Binghamton, NY
Lines: 262


After a number of private communications, I've managed to render
one of the little benchmarks I have presentable enough to post,
along with some performance figures from different machines
(basically, its put up or shut up time).

It has been claimed that a lack of 32x32->64 multiplication
makes a factor of 10 difference in the running time of
typical extended precision arithmetic routines. Although it
obviously makes _a_ difference in run time I do not measure
an order of magnitude difference.

Following is a table of the total runtime for a particular
benchmark that does a few simple-minded products. The
figures in parens are those with optimization on.

---------------------------------------------------------------------------
Machine		32x32->32	8x8->16		ratio	bc

VAX 6440	
		1.3		3.9		3.0
		(0.88)		(2.5)		2.8

VAX 8530	
cc		3.2		9.5		3.0	3.1!!!
		(2.9)		(9.3)		3.2
gcc		3.1		8.4		2.7
		(1.9)		(6.1)		3.2

SUN 3/60
		4.8		12.2		2.54	7.8
		(2.5)		(6.2)

Sun Sparc 1
		2.4		6.6		2.75	2.5!!!
		(1.1)		(3.3)		3.0
---------------------------------------------------------------------------

The benchmark itself follows:

#include	<stdio.h>
#include	<malloc.h>

#define	DEBUG	/***/
#define	DOUBLE

#ifdef	DOUBLE
#define	NUM	15
#define	MAX	100
#define	SHORT	unsigned short
#define	LONG	unsigned long
#else
#define	NUM	15
#define	MAX	200
#define	SHORT	unsigned char
#define	LONG	unsigned short
#endif	/*DOUBLE*/

#ifdef	__STDC__
#define	_(x)	x
#else
#define	_(x)	()
#endif

#ifdef	DEBUG
#define	Assert(x)	{if(!(x)){fprintf(stderr,"Assert failed: line %d\n",\
				__LINE__); exit(-1);}}
#else
#define	Assert(x)
#endif

typedef struct {
	SHORT *top,*bot,*max;
	} B;

B *a[NUM][NUM],*b[NUM][NUM];

B *zap _((B*));
B *newb _((void));
void kill _((B*));
void part1 _((void));
void part2 _((void));
B *add2 _((B*,B*));
B *fact _((int));
B *mul3 _((B*,B*,B*));
B *muladd _((B*,B*,int));

main() {
	printf("sizeof SHORT=%d\n",8*sizeof(SHORT));
	printf("sizeof LONG=%d\n",8*sizeof(LONG));
	part1();
	part2();
	exit(0);
	}

void part1() {
	int i,j;

	for(i=0; i<NUM; ++i)
	for(j=0; j<NUM; ++j) {
		b[i][j] = fact(i+j);
		a[j][i] = fact(i*j);
		}
	}

void part2() {
	int i,j,k;
	B *t1 = newb(), *t2 = newb();

	for(i = 0; i<NUM; ++i)
	for(j = 0; j<NUM; ++j) {
		zap(t1);
		for(k = 0; k<NUM; ++k) {
			mul3(t2,a[i][k],b[k][j]);
			add2(t1,t2);
			}
		}
	kill(t1);
	kill(t2);
	}

B *fact(x) {
	B *res = newb();
	SHORT *ptr;
	LONG cry;

	res->bot[0] = 1;
	while(x-->1) {
		cry = 0;
		for(ptr=res->bot; ptr<=res->top; ) {
			cry += (LONG)x**ptr;
			*ptr++ = cry;
			cry >>= 8*sizeof(SHORT);
			}
		if(cry) *(res->top = ptr) = cry;
		}
	Assert(res->top<res->max);
	Assert(res->bot<=res->top);
	return res;
	}

/* x = y*z */
B *mul3(x,y,z) B *x,*y,*z; {
	SHORT *ptr;

	Assert(y->top<y->max);
	Assert(y->bot<=y->top);
	Assert(z->top<z->max);
	Assert(z->bot<=z->top);
	zap(x);
	for(ptr = z->bot; ptr<=z->top; )
		muladd(x,y,*ptr++);
	return x;
	}

/* x += y*k */
B *muladd(x,y,k) B *x,*y; {
	SHORT *ptr,*ptr2;
	LONG cry = 0;

	Assert(x->top<x->max);
	Assert(x->bot<=x->top);
	Assert(y->top<y->max);
	Assert(y->bot<=y->top);
	ptr = x->bot;
	for(ptr2 = y->bot; ptr2<=y->top; ) {
		Assert(ptr<x->max);
		cry += (LONG)k**ptr2++;
		if(ptr<=x->top) cry += *ptr;
		*ptr++ = cry;
		cry >>= 8*sizeof(SHORT);
		}
	while(cry) {
		cry += *ptr;
		*(x->top = ptr++) = cry;
		cry >>= 8*sizeof(SHORT);
		}
	return x;
	}

/* x += y */
B *add2(x,y) B *x,*y; {
	SHORT cry;
	SHORT *ptr,*ptr2;

	Assert(y->top<y->max);
	Assert(y->bot<=y->top);
	for(ptr = x->bot,ptr2 = y->bot; ptr2<=y->top; ) {
		cry += (LONG)*ptr+*ptr2++;
		*ptr++ = cry;
		cry >>= 8*sizeof(SHORT);
		}
	if(cry) *(ptr = x->top) = cry;
	return x;
	}

B *newb() {
	B *tmp;

	tmp = (B*)malloc(sizeof(B));
	Assert(tmp);
	tmp->top = tmp->bot = (SHORT*)malloc(MAX*sizeof(SHORT));
	Assert(tmp->top);
	tmp->max = tmp->bot + MAX;
	return tmp;
	}

void kill(x) B*x; {

	free(x->bot);
	free(x);
	}

B *zap(x) B*x; {

	x->top = x->bot;
	x->bot[0] = 0;
	return x;
	}

---------------------------------------------------------------------------

A word of explanation -- on the machines tested `short' was 16,
`char' was 8 and `long' (_sometimes_ the same size as `int') was 32
bits. The idea is to compare the running times of the two versions
of the program created by alternately defining and undefining `DOUBLE'.

With DOUBLE _defined_ the program uses 32x32->32 arithmetic
to essentially perform 16x16->32 bit arithmetic. In std C this
is performed by

	int x,y;
	long z;
	z = (long)x * y;

extends both operands (the order of loading operands into registers is,
of course, not defined to allow optimization some elbow-room).

With DOUBLE _undefined_ (comment out that line) the program forms
the same computation using 16x16->16 in order to perform
essentially 8x8->16 bit arithmetic. Arrangement similar to above.
(Of course the program uses unsigned arithmetic in these contexts).

I think the results show that nxn->2n bit arithmetic can
be replaced, at least for _this_ benchmark, by 2n x 2n -> 2n
with a loss of about a factor of 3.

Another point to note (although Robert Silverman is not impressed)
is the performance of `bc'. Taking into consideration that
it has no assembler coding (I am informed so by those-who-know
at Sun) and, I understand, performs its arithmetic on _bytes_
that code simply `higits' [0,99] it compares remarkably well
with hard-coded (though not very good) C.

I would appreciate someone pointing out the error in my reasoning.

-Kym Horsell.

P.S. The benchmark was _deliberately_ kept rather simple; I
	wanted to measure the performance of _multiply_ in
	the context of basic extended precision arithmetic, not
	the memory or i/o subsystems.