Path: utzoo!utgpu!water!watmath!clyde!rutgers!sri-spam!ames!aurora!labrea!decwrl!spar!malcolm
From: malcolm@spar.SPAR.SLB.COM (Malcolm Slaney)
Newsgroups: comp.lang.lisp
Subject: Lisp vs. C Floating Point (Suns)
Message-ID: <536@spar.SPAR.SLB.COM>
Date: 29 Jan 88 19:06:37 GMT
Organization: SPAR - Schlumberger Palo Alto Research
Lines: 325

The times shown in the table below are number of seconds to calculate 10
iterations of a 1024 point FFT.  In this study the C and Lisp version
of the FFT routine are identical.  I started with Harry Barrow's FFT 
benchmark (with declarations added by Lucid) and then translated it
statement by statement into C (thanks to Dave Singer for double checking
this.)  I believe that this is as accurate a comparison as is possible.

Note, all of the Sun tests were measured with a version of Lucid Common
Lisp labelled "Development Environment 2.1.1".  This lisp only does single
precision arithmetic using the 68881.  Thus the only two lines that can
be directly compared are "Single 68881 C" versus "Lucid 2.1.1".

I believe it is now reasonable to conclude that Lisp compilers are as
fast as C compilers.  I don't have any numbers for a prodution quality
Lisp compiler for the Sun/4's.  I don't know of any reason the SPARC 
machine would run compiled Lisp slower than C.

Finally, a word about register declarations in C:  The times shown here
were measured by putting the variables I thought were most accessed into
registers.  I did this with the same effort I normally go to make my own
programs fast.  I did not do any exhaustive testing to find the best set.
When all of the register declarations are removed the 3/160 times got slower
by .8 seconds and all the 2/260 times got slower by about .4 seconds.  The
Sun 4/260 numbers did not change any indicating they are doing global register
optimization or the 4/260 is being completely dominated by the floating point
work.  The Sun 3 no-register times make sense since only the loop iteration
and array addresses were put in registers and thus they are independent of 
the floating point work.
							Malcolm

	***********************************************************
	Time (in seconds) to execute 10 iterations of a 1024 point FFT

			3/160		3/260		4/260

Double FPA C		2.4		1.5		0.8 (*)
Single FPA C		1.3		0.8		0.3 (*)

Double 68881 C		5.0		4.1		0.8 (*)
Single 68881 C		4.8		4.1		0.3 (*)  <======
Lucid 2.1.1		4.7		4.0		         <======

Symbolics Machine	4.7 (No IFU, No FPA - From Gabriel - Duplicated @ SPAR)
Symbolics Machine	3.9 (with either IFU or FPA - From Gabriel)


*	Note that Sun4/260's only have one floating point option.  They use
	a pair of Weitek floating point chips.


And finally, here is the actual C version of the Barrow/Gabriel/Lucid
FFT benchmark.  The original Lisp is in comments.

	***********************************************************
/* 
 * ;; FFT -- This is an FFT benchmark written by Harry Barrow.
 * ;; It tests a variety of floating point operations, including array 
 * ;; references.
 * ;; 
 * ;; Originally written in Lisp.
 * ;; Translated to C by Malcolm@spar.slb.com    January 1988.

 * Thanks to Dave Singer @ SPAR for double checking the translation.

/* 
 *
 * (defvar **fft-re**
 *   (make-array 1025. :element-type 'single-float :initial-element 0.0))
 * (defvar **fft-im**
 *   (make-array 1025. :element-type 'single-float :initial-element 0.0))
 */
float fft_re[1025], fft_im[1025];


/* 
 * (defvar s-pi (float pi 0.0))
 */
#define	s_pi 3.141592653589792434

/*
 * (defun fft (areal aimag)
 *   (declare (type (simple-array single-float (*)) areal aimag))
 */
fft(areal, aimag)
float areal[], aimag[];
{
/*	  (prog* ((ar areal)
 *		  (ai aimag)
 *		  (i 1)
 *		  (j 0)
 *		  (k 0)
 *		  (m 0)				;compute m = log(n)
 *		  (n (1- (array-dimension ar 0)))
 *		  (nv2 (floor n 2))
 *		  (le 0) (le1 0) (ip 0)
 *		  (ur 0.0) (ui 0.0) (wr 0.0) (wi 0.0) (tr 0.0) (ti 0.0))
 *          (declare (type fixnum i j k n nv2 m le le1 ip))
 *          (declare (type (simple-array single-float (*)) ar ai))
 *          (declare (single-float ur ui wr wi tr ti))
 */
	register float *ar = areal, *ai = aimag;
	register int	i = 1, j = 0, k = 0, m = 0; 
	int	n = 1024, nv2 = 512, le = 0,
		le1 = 0, ip = 0;
	float	ur = 0.0, ui = 0.0, wr = 0.0, wi = 0.0, tr = 0.0, ti = 0.0;
	register int	l;

/*	    l1 (cond ((< i n)
 *		      (setq m (the fixnum (1+ m))
 *			    i (the fixnum (+ i i)))
 *		      (go l1)))
 */
	l1 :
		if (i < n){
			m += 1;
			i += i;
			goto l1;
		}

/*
 *	    (cond ((not (equal n (the fixnum (expt 2 m))))
 *		   (princ "error ... array size not a power of two.")
 *		   (read)
 *		   (return (terpri))))
 */

/*
 *	    (setq j 1				;interchange elements
 *		  i 1)				;in bit-reversed order
 */
	j = 1;
	i = 1;

/*
 *	    l3 (cond ((< i j)
 *		      (setq tr (aref ar j)
 *			    ti (aref ai j))
 *		      (setf (aref ar j) (aref ar i))
 *		      (setf (aref ai j) (aref ai i))
 *		      (setf (aref ar i) tr)
 *		      (setf (aref ai i) ti)))
 */
	l3:
		if (i < j){
			tr = ar[j];
			ti = ai[j];
			ar[j] = ar[i];
			ai[j] = ai[i];
			ar[i] = tr;
			ai[i] = ti;
		}

/*
 *	    (setq k nv2)
 *	    l6 (cond ((< k j)
 *		      (setq j (the fixnum (- j k))
 *			    k (the fixnum (/ k 2)))
 *		      (go l6)))
 *	    (setq j (the fixnum (+ j k))
 *		  i (the fixnum (1+ i)))
 */
		k = nv2;
	l6:
		if (k < j){
			j -= k;
			k /= 2;
			goto l6;
		}
	j += k;
	i += 1;

/*
 *	(cond ((< i n)
 *	       (go l3)))
 */
	if (i < n)
		goto l3;

/*
 *	    (do ((l 1 (the fixnum (1+ l))))
 *		((> (the fixnum l) m))		;loop thru stages
 *	      (declare (type fixnum l))
 *	      (setq le (the fixnum (expt 2 l))
 *		    le1 (floor le 2) ;;(the (values fixnum fixnum) (floor le 2))
 *		    ur 1.0
 *		    ui 0.0			;This used to be fixnum 0 --smh
 *		    wr (cos (/ s-pi le1))
 *		    wi (sin (/ s-pi le1)))
 */
	for (l = 1; l <= m; l++){
		le = 1 << l;
		le1 = le / 2;
		ur = 1.0;
		ui = 0.0;
		wr = cos(s_pi / le1);
		wi = sin(s_pi / le1);

/*
 *	      (do ((j 1 (the fixnum (1+ (the fixnum j)))))
 *		  ((> (the fixnum j) le1))	;loop thru butterflies
 *		(declare (type fixnum j))
 *		(do ((i j (the fixnum (+ i le))))
 *		    ((> (the fixnum i) n))	;do a butterfly
 *		  (declare (type fixnum i))
 *		  (setq ip (the fixnum (+ i le1))
 *			tr (- (* (aref ar ip) ur)
 *			      (* (aref ai ip) ui))
 *			ti (+ (* (aref ar ip) ui)
 *			      (* (aref ai ip) ur)))
 *		  (setf (aref ar ip) (- (aref ar i) tr))
 *		  (setf (aref ai ip) (- (aref ai i) ti))
 *		  (setf (aref ar i) (+ (aref ar i) tr))
 *		  (setf (aref ai i) (+ (aref ai i) ti))))
 *	      (setq tr (- (* ur wr) (* ui wi))
 *		    ti (+ (* ur wi) (* ui wr))
 *		    ur tr
 *		    ui ti))
 *	    (return t)))
 */
		for (j = 1; j <= le1; j++){
			for (i = j; i <= n; i += le){
				ip = i + le1;
				tr = ar[ip]*ur - ai[ip] * ui;
				ti = ar[ip]*ui + ai[ip] * ur;
				ar[ip] = ar[i] - tr;
				ai[ip] = ai[i] - ti;
				ar[i] += tr;
				ai[i] += ti;
			}
		}
		tr = ur * wr - ui * wi;
		ti = ur * wi + wi * wr;
		ur = tr;
		ui = ti;
	}
}

/*
 *	 (defun fft-bench ()
 *	  (dotimes (i 10)
 *	    (fft **fft-re** **fft-im**)))
 */

fft_bench(){
	int	i;
	for (i=0; i< 10; i++)
		fft(fft_re, fft_im);
}


/*
 *	(defun testfft ()
 *	  (format *trace-output* "; doing fft: (fft-bench)~%") 
 *	  (print (time (fft-bench))))
 */

/*
 *	(defun clear-fft ()
 *	  (dotimes (i 1025)
 *	    (setf (aref **fft-re** i) 0.0
 *		  (aref **fft-im** i) 0.0))
 *	  (values))
 */

clear_fft(){
	int	i;
	for (i=0;i<1025;i++)
		fft_re[i] = fft_im[i] = 0;
}

/*
 *	(defun setup-fft-component (theta &optional (phase 0.0))
 *	  (let ((f (* 2 pi theta))
 *		(c (cos (* 0.5 pi phase)))
 *		(s (sin (* 0.5 pi phase))))
 *	    (dotimes (i 1025)
 *	      (let ((x (sin (* f (/ i 1024.0)))))
 *		(incf (aref **fft-re** i) (float (* c x) 0.0))
 *		(incf (aref **fft-im** i) (float (* s x) 0.0)))))
 *	  (values))
 */
 setup_fft_component(theta,phase)
 float	theta, phase;
 {
	float	f, c, s;
	int	i;
	f = 2 * s_pi * theta;
	c = cos(0.5 * s_pi * phase);
	s = sin(0.5 * s_pi * phase);
	for (i = 0; i < 1025; i++){
		float	x;
		x = sin(f * (i / 1024.0));
		fft_re[i] += c * x;
		fft_im[i] += s * x;
	}
}

/*
 *	(defvar fft-delta 0.0001)
 */
#define	fft_delta 0.0001

/*
 *	(defun print-fft ()
 *	  (dotimes (i 1025)
 *	    (let ((re (aref **fft-re** i))
 *		  (im (aref **fft-im** i)))
 *	      (unless (and (< (abs re) fft-delta) (< (abs im) fft-delta))
 *		(format t "~4d  ~10f ~10f~%" i re im))))
 *	  (values))
 */
 print_fft(){
	int	i;
	for (i = 0; i< 1025; i++){
		float	re, im;
		re = fft_re[i];
		im = fft_im[i];
		if (abs(re) > fft_delta || abs(im) > fft_delta)
			printf("%4d  %10f %10f\n", i, re, im);
	}
}

main(){
	fft_bench();
}