Path: utzoo!utgpu!news-server.csri.toronto.edu!cs.utexas.edu!uunet!overload!dillon
From: dillon@overload.Berkeley.CA.US (Matthew Dillon)
Newsgroups: comp.sys.amiga.programmer
Subject: Re:  New life for MOVEM!
Message-ID: <dillon.4191@overload.Berkeley.CA.US>
Date: 22 Feb 91 20:10:08 GMT
References: <1991Feb11.160212.7749@vax1.tcd.ie> <19100@cbmvax.commodore.com> <1991Feb21.115145.7828@vax1.tcd.ie>
Organization: Not an Organization
Lines: 153

In article <1991Feb21.115145.7828@vax1.tcd.ie> smcgerty@vax1.tcd.ie writes:
>In article <19100@cbmvax.commodore.com>, jesup@cbmvax.commodore.com (Randell Jesup) writes:
>> In article <1991Feb11.160212.7749@vax1.tcd.ie> smcgerty@vax1.tcd.ie writes:
>>...
>
>Hey, I don't doubt the OS is very fast and neat; we all use it quite often, and
>it's great, etc. However, as far as giving people a deeper understanding of
>68000 programming is concerned, an example of a movem-loop in assembly is a
>bit better than a recommendation to use an OS routine.
>
>By writing my example, I wasn't really trying to fulfill someone's desire to
>have a fast-copy-memory routine, but instead I wanted to stimulate an interest
>in the techniques of using the 68000 efficiently.
>
>Re-inventing the wheel is often the best way of educating yourself. I find it
>helpful, and I reckon others do too.
>...

    I generally post this about once a year when the question comes up.
    Here is a fully working MOVMEM() call that optimizes via MOVEM:

					-Matt

    Matthew Dillon	    dillon@Overload.Berkeley.CA.US
    891 Regal Rd.	    uunet.uu.net!overload!dillon
    Berkeley, Ca. 94708
    USA


		;   MOVMEM.A
		;
		;   (c)Copyright 1990, Matthew Dillon, All Rights Reserved

		section text,code

		;   movmem(src, dst, len)   (ANSI)
		;   bcopy(src, dst, len)    (UNIX)
		;	    A0	A1   D0     DICE-REG
		;	    A0	A1   D0     internal
		;	 4(sp) 8(sp) 12(sp)
		;
		;   The memory move algorithm is somewhat more of a mess
		;   since we must do it either ascending or descending,
		;   depending on whether dst lies below or above src.

		xdef	_movmem
		xdef	_bcopy	    ; UNIX
		xdef	@movmem
		xdef	@bcopy	    ; UNIX


;   Entry points.
;     _movmem / _bcopy : arguments on the stack
;			 4(sp) = src, 8(sp) = dst, 12(sp) = len
;     @movmem / @bcopy : arguments already in registers
;			 A0 = src, A1 = dst, D0 = len
;   Picks the copy direction from the relative position of the two
;   buffers and falls through into the descending copy when dst is
;   above src.
_movmem:
_bcopy:
		movem.l 4(sp),A0/A1	;A0 = src, A1 = dst
		move.l	12(sp),D0	;D0 = len
@movmem:
@bcopy:
		cmp.l	A0,A1		;flags from dst - src
		beq	xbmend		;src == dst: nothing to copy
		bcs	xbmup		;dst below src: ascending copy
					;otherwise fall into the descending copy
;   Descending copy, used when dst is above src.
;   In:  A0 = src, A1 = dst, D0 = byte count (unsigned 32 bit)
;   Both pointers are first moved to the END of their buffers and the
;   data is then copied from the top down using predecrement addressing.
;   Three loops are used, fastest first:
;     xbmd44a  44 bytes per pass through 11 registers with MOVEM
;     xbmd8a    8 bytes per pass with two long moves
;     xbmd1a    1 byte per pass (also used for the whole copy when
;		either end pointer is odd)
xbmdown 	adda.l	D0,A0		;descending copy
		adda.l	D0,A1
		move.w	A0,D1		;CHECK WORD ALIGNED
		lsr.l	#1,D1		;bit 0 of the src end pointer -> carry
		bcs	xbmdown1	;odd: copy everything bytewise
		move.w	A1,D1
		lsr.l	#1,D1		;bit 0 of the dst end pointer -> carry
		bcs	xbmdown1	;odd: copy everything bytewise
		cmp.l	#259,D0 	    ;chosen by calculation.
		bcs	xbmdown8	    ;fewer than 259 bytes: skip the MOVEM loop

		move.l	D0,D1		    ;overhead for bmd44: ~360
		divu	#44,D1		    ;D1.w = len / 44, D1 high word = len % 44
		bvs	xbmdown8	    ;quotient > 65535 (len >= 2,883,584): D0 is intact, use 8-byte loop
		movem.l D2-D7/A2-A6,-(sp)   ;use D2-D7/A2-A6 (11 regs)
		move.l	#44,D0		    ;D0 = block size, high word is 0
		bra	xbmd44b 	    ;enter at the dbf so the body runs D1.w times
xbmd44a 	sub.l	D0,A0		    ;8		total 214/44bytes
		movem.l (A0),D2-D7/A2-A6    ;12 + 8*11  4.86 cycles/byte
		movem.l D2-D7/A2-A6,-(A1)   ; 8 + 8*11
xbmd44b 	dbf	D1,xbmd44a	    ;10
		swap	D1		    ;remainder into D1.w; D0<31:16> already contain 0
		move.w	D1,D0		    ;D0 = remainder
		movem.l (sp)+,D2-D7/A2-A6   ;restore the 11 saved registers

xbmdown8	move.w	D0,D1		    ;D1<2:0> = #bytes left later
		lsr.l	#3,D0		    ;divide by 8
		bra	xbmd8b		    ;enter at the dbf so the body runs D0 times
xbmd8a		move.l	-(A0),-(A1)         ;20         total 50/8bytes
		move.l	-(A0),-(A1)         ;20         = 6.25 cycles/byte
xbmd8b		dbf	D0,xbmd8a	    ;10
		sub.l	#$10000,D0	    ;dbf only counts D0.w: step the high word
		bcc	xbmd8a		    ;no borrow: another 65536 passes remain
		move.w	D1,D0		    ;D0 = 0 to 7 bytes
		and.l	#7,D0		    ;clear everything but the low 3 bits
		bne	xbmdown1	    ;copy the 1..7 trailing bytes
;   Return.  D0 is loaded from 8(sp), which is dst for the stack-argument
;   entry points.
;   NOTE(review): when entered through @movmem/@bcopy no arguments are on
;   the stack, so D0 is then whatever the caller has at 8(sp) -- confirm
;   that register-argument callers ignore the return value.
xbmend
		move.l	8(sp),D0
		rts

;   Byte loop.  Entered at xbmdown1 with D0 = number of bytes to copy:
;   the full length for odd pointers, or the 1..7 byte tail.
xbmd1a		move.b	-(A0),-(A1)         ;12         total 22/byte
xbmdown1				    ;		= 22 cycles/byte
xbmd1b		dbf	D0,xbmd1a	    ;10
		sub.l	#$10000,D0	    ;dbf only counts D0.w: step the high word
		bcc	xbmd1a		    ;no borrow: another 65536 passes remain
		move.l	8(sp),D0	    ;same return value as at xbmend
		rts

;   Ascending copy, used when dst is below src.
;   In:  A0 = src, A1 = dst, D0 = byte count (unsigned 32 bit)
;   Data is copied from the start of the buffers upward.  Three loops
;   are used, fastest first:
;     xbmu44a  44 bytes per pass through 11 registers with MOVEM
;     xbmu8a    8 bytes per pass with two long moves
;     xbmu1a    1 byte per pass (also used for the whole copy when
;		either start pointer is odd)
xbmup		move.w	A0,D1		    ;CHECK WORD ALIGNED
		lsr.l	#1,D1		    ;bit 0 of src -> carry
		bcs	xbmup1		    ;odd: copy everything bytewise
		move.w	A1,D1
		lsr.l	#1,D1		    ;bit 0 of dst -> carry
		bcs	xbmup1		    ;odd: copy everything bytewise
		cmp.l	#259,D0 	    ;chosen by calculation
		bcs	xbmup8		    ;fewer than 259 bytes: skip the MOVEM loop

		move.l	D0,D1		    ;overhead for bmu44: ~360
		divu	#44,D1		    ;D1.w = len / 44, D1 high word = len % 44
		bvs	xbmup8		    ;quotient > 65535 (len >= 2,883,584): D0 is intact, use 8-byte loop
		movem.l D2-D7/A2-A6,-(sp)   ;use D2-D7/A2-A6 (11 regs)
		move.l	#44,D0		    ;D0 = block size, high word is 0
		bra	xbmu44b 	    ;enter at the dbf so the body runs D1.w times
xbmu44a 	movem.l (A0)+,D2-D7/A2-A6   ;12 + 8*11  ttl 214/44bytes
		movem.l D2-D7/A2-A6,(A1)    ;8  + 8*11  4.86 cycles/byte
		add.l	D0,A1		    ;8		advance dst by hand (D0 = 44)
xbmu44b 	dbf	D1,xbmu44a	    ;10
		swap	D1		    ;remainder into D1.w; D0<31:16> already contain 0
		move.w	D1,D0		    ;D0 = remainder
		movem.l (sp)+,D2-D7/A2-A6   ;restore the 11 saved registers

xbmup8		move.w	D0,D1		    ;D1<2:0> = #bytes left later
		lsr.l	#3,D0		    ;divide by 8
		bra	xbmu8b		    ;enter at the dbf so the body runs D0 times
xbmu8a		move.l	(A0)+,(A1)+         ;20         total 50/8bytes
		move.l	(A0)+,(A1)+         ;20         = 6.25 cycles/byte
xbmu8b		dbf	D0,xbmu8a	    ;10
		sub.l	#$10000,D0	    ;dbf only counts D0.w: step the high word
		bcc	xbmu8a		    ;no borrow: another 65536 passes remain
		move.w	D1,D0		    ;D0 = 0 to 7 bytes
		and.l	#7,D0		    ;clear everything but the low 3 bits
		bne	xbmup1		    ;copy the 1..7 trailing bytes
;   Return.  D0 is loaded from 8(sp), which is dst for the stack-argument
;   entry points.
;   NOTE(review): when entered through @movmem/@bcopy no arguments are on
;   the stack, so D0 is then whatever the caller has at 8(sp) -- confirm
;   that register-argument callers ignore the return value.
		move.l	8(sp),D0
		rts

;   Byte loop.  Entered at xbmup1 with D0 = number of bytes to copy:
;   the full length for odd pointers, or the 1..7 byte tail.
xbmu1a		move.b	(A0)+,(A1)+
xbmup1
xbmu1b		dbf	D0,xbmu1a
		sub.l	#$10000,D0	    ;dbf only counts D0.w: step the high word
		bcc	xbmu1a		    ;no borrow: another 65536 passes remain
		move.l	8(sp),D0	    ;same return value as above
		rts

		END