Path: utzoo!utgpu!water!watmath!uunet!lll-winken!killer!ames!claris!apple!amdahl!ems!quest!zeno!gene
From: gene@zeno.MN.ORG (Gene H. Olson)
Newsgroups: unix-pc.general
Subject: Re: 3b1 ulrem() bug
Keywords: 3b1 "unsigned long remainder" ulrem bug
Message-ID: <12@zeno.MN.ORG>
Date: 24 Jul 88 03:45:11 GMT
References: <5552@ihlpg.ATT.COM>
Reply-To: gene@zeno.UUCP (Gene H. Olson)
Distribution: unix-pc
Organization: Smartware Consulting, Minneapolis, MN
Lines: 270

In article <5552@ihlpg.ATT.COM> bgb@ihlpg.ATT.COM (Beuning) writes:
>3b1 owners,
>
>	I believe I have found a bug in unsigned long
>remainder arithmetic.  If you compile and execute this
>program on your 3b1, the value of 'c' is incorrect.

		  (He gives a very good example)

I have a fix for you.   The following set of long multiply/divide
routines do not have the bug, and are faster than the standard ATT
routines by a significant margin (enough to bump the dhrystone by 100).

They won't work directly with the shared library, but if you use
the `scc' script I just posted to unix-pc.sources you will have
no trouble using it on shared library programs also.

The only hitch is that shared library modules won't use them (I think)
just modules which are not shared.   So your program may still get
the wrong answer if the library routines are the ones finding the problem.

I wrote these routines, and I hereby place them in the public domain.
I would be honored if ATT would incorporate them to fix the
bug reported above.

Gene H. Olson
Smartware Consulting
Minneapolis, MN
amdahl!bungia!zeno!gene


------------------- Cut here ----------------
#***************************************************
#*    Long and ulong Multiply routines.            *
#***************************************************

	global	ulmul
	global	lmul
ulmul:
lmul:
	mov.l	8(%sp),%d0	# Get d0

	global	lmul__
	global	ulmul__
lmul__:
ulmul__:
	mov.l	4(%sp),%d1	# Get parameters

	mov.l	%d2,%a0		# Save d2, d3
	mov.l	%d3,%a1

	mov.l	%d0,%d2		# Copy & swap operands
	mov.l	%d1,%d3
	swap.w	%d2
	swap.w	%d3

	mulu.w	%d1,%d2		# Do the multiply
	mulu.w	%d0,%d3
	mulu.w	%d1,%d0

	add.w	%d3,%d2		# Add up components
	swap.w	%d2
	clr.w	%d2
	add.l	%d2,%d0

	mov.l	%a0,%d2
	mov.l	%a1,%d3
	rts


#********************************************************
#*	Signed Long Divide                              *
#********************************************************

	global	ldiv__
ldiv__:
	mov.l	4(%sp),%d1	# Get operands

sdiv:
	tst.l	%d0		# Test dividend sign
	bpl	sdiv2		# - Positive

	neg.l	%d0		# Make positive

	tst.l	%d1		# Test divisor sign
	bpl	sdiv1		# - positive

	neg.l	%d1		# Make positive

	bsr	udiv		# Divide -x / +y

	neg.l	%d1		# Make remainder negative
	rts

sdiv1:
	bsr	udiv		# Divide -x / -y

	neg.l	%d0		# Negate quotient
	neg.l	%d1		# Negate remainder
	rts

sdiv2:
	tst.l	%d1		# Test divisor sign
	bpl	udiv		# - Divide +x / +y

	neg.l	%d1		# Make it positive

	bsr	udiv		# Divide +x / -y

	neg.l	%d0		# Negate quotient
	rts


#*********************************************************
#*	Unsigned Long Divide                             *
#*********************************************************

	global	uldiv__
uldiv__:
	mov.l	4(%sp),%d1	# Get operands

udiv:
	mov.l	%d2,%a0		# Save d2

#	Check if the MSB of the divisor are zero.  If so,
#	we can do an easy 32/16 -> 32 bit divide.

	swap.w	%d1		# Is the divisor 16 bits?
	mov.w	%d1,%d2
	bne	udiv2		# - no, 32 bit divide needed

	swap.w	%d0		# Setup MSB divide
	swap.w	%d1
	swap.w	%d2
	mov.w	%d0,%d2
	beq	udiv1		# - msb divide unnecessary

	divu.w	%d1,%d2		# Do MSB divide
	mov.w	%d2,%d0

udiv1:
	swap.w	%d0		# Do LSB divide
	mov.w	%d0,%d2
	divu.w	%d1,%d2

	mov.w	%d2,%d0		# Put quotient in d0

	swap.w	%d2		# Put remainder in d1
	mov.w	%d2,%d1

	mov.l	%a0,%d2		# Restore d2
	rts

#	Upper bits were non-zero.  We have to do a
#	32/32 -> 16 bit divide.
#
#	For the divide to be efficient, we need to first
#	normalize the divisor,  and then shift the dividend
#	left by the same amount.

udiv2:
	mov.l	%d3,%a1		# Save d3

	mov.l	&16,%d3		# Initialize shift count

	cmp.w	%d1,&0x0080	# Normalize 8 bits?
	bcc	udiv3		# - not needed

	rol.l	&8,%d1		# Do the shift
	sub.w	&8,%d3		# Adjust shift count

udiv3:
	cmp.w	%d1,&0x0800	# Normalize 4 bits?
	bcc	udiv4		# - not needed

	rol.l	&4,%d1		# Do the shift
	sub.w	&4,%d3		# Adjust shift count

udiv4:
	cmp.w	%d1,&0x2000	# Normalize 2 bits?
	bcc	udiv5		# - not needed

	rol.l	&2,%d1		# Do the shift
	sub.w	&2,%d3		# Adjust shift count

udiv5:
	tst.w	%d1		# Single bit normalization needed?
	bmi	udiv6		# - no

	rol.l	&1,%d1		# Do the shift
	sub.w	&1,%d3		# Adjust shift count

udiv6:
	mov.w	%d0,%d2		# Shift dividend left by same amount
	lsr.l	%d3,%d0		# d0 = x01
	swap.w	%d2		# d2 = x2
	clr.w	%d2
	lsr.l	%d3,%d2
	swap.w	%d3

#	Divide x012 by y01.
#
#	q0 = x01 / y0
#	r012 = (x01 % y0) * m1 + x2 - q0 * y1
#	if (r01 < 0) {
#		r01 += y01 ;
#		q0 -= 1 ;
#		}

	divu.w	%d1,%d0		# q0 = x01 / y0

	mov.w	%d0,%d3		# r01 = (x01 % y0) * m1 - q0 * y1
	mov.w	%d2,%d0
	mov.w	%d3,%d2
	swap.w	%d1
	mulu.w	%d1,%d2
	sub.l	%d2,%d0

	bcc	udiv7		# - r01 >= 0

	sub.w	&1,%d3		# q0 -= 1
	add.l	%d1,%d0		# r01 += y01

#	Restore remainder.

udiv7:
	mov.l	&0,%d1		# Get quotient
	mov.w	%d3,%d1

	swap.w	%d3		# Get remainder
	swap.w	%d0
	rol.l	%d3,%d0

	exg	%d0,%d1		# d0 = quotient, d1=remainder

	mov.l	%a0,%d2		# Restore d2, d3
	mov.l	%a1,%d3
	rts


#*******************************************************
#*      Signed remainder                               *
#*******************************************************

	global	lrem__
lrem__:
	mov.l	4(%sp),%d1	# Get operands

	bsr	sdiv		# Do divide/modulus

	exg	%d0,%d1		# Get remainder
	rts


#********************************************************
#*      Unsigned remainder                              *
#********************************************************


	global	ulrem__
ulrem__:
	mov.l	4(%sp),%d1	# Get operands

	bsr	udiv		# Do divide/modulus

	exg	%d0,%d1		# Get remainder
	rts