Path: utzoo!utgpu!news-server.csri.toronto.edu!rpi!zaphod.mps.ohio-state.edu!swrinde!ucsd!dog.ee.lbl.gov!ucbvax!bloom-beacon!eru!hagbard!sunic!dkuug!diku!bombadil
From: bombadil@diku.dk (Kristian Nielsen)
Newsgroups: comp.sys.amiga.programmer
Subject: Re: Fast 3d Graphics (long)
Keywords: fast 3d optimizing assembly cycles
Message-ID: <1991Apr5.161952.26178@odin.diku.dk>
Date: 5 Apr 91 16:19:52 GMT
References: <9529@star.cs.vu.nl> <1047@cbmger.UUCP> <20338@cbmvax.commodore.com> <9554@star.cs.vu.nl>
Sender: news@odin.diku.dk (Netnews System)
Organization: Department of Computer Science, U of Copenhagen
Lines: 701

mmblom@cs.vu.nl (Marc Blom) writes:

> 
>>  I seem to recall someone posted a 'really fast' line drawing 
>>routine a while back. It was highly optmized for speed using the
>>CPU (not the blitter. It also was very long because loops were unrolled
>>and it had lots of special cases for different line slopes.)
> 
>Love to hear from the one that posted this ! (or someone who has it)
> 
> 
  I think this probably refers to a rutine posted by John Schultz; I'll try
and see if I've still got it, or he might repost (again :-) ). However,
meanwhile, you might want to take a look at this. It implements line draws
to a 4-plane bitmap, and includes a small demo. The main trick is not to
operate on x/y coords in the drawing loop, but instead manipulate screen
addresses and masks; also, plotting points is done while avoiding looping
through each bitplane.
  Note that the slope of a line is calculated using a divide instruction -
this could be optimised by using a slightly different algorithm. Also, I
agree - this rutine IS a bit long.


	Kristian

Following long source (for Hisoft DevPac 2):

* Test-call.

         INCDIR   Include/

         INCLUDE  exec/exec_lib.i
         INCLUDE  intuition/intuition.i
         INCLUDE  intuition/intuition_lib.i

TEST
         moveq    #0,d0
         lea      iname(pc),a1
         movea.l  4.w,a6
         jsr      _LVOOpenLibrary(a6)
         move.l   d0,ibase
         beq      noint
         movea.l  d0,a6

         lea      ns(pc),a0
         jsr      _LVOOpenScreen(a6)
         move.l   d0,screen
         beq      noscreen
         move.l   d0,scrptr

         lea      nw(pc),a0
         jsr      _LVOOpenWindow(a6)
         move.l   d0,window
         beq      nowind
         movea.l  d0,a4
         movea.l  wd_RPort(a4),a4
         movea.l  rp_BitMap(a4),a4
         lea      bm_Planes(a4),a4

         moveq    #5,d7
loop
         movea.l  screen(pc),a0
         move.w   sc_MouseX(a0),d0
         move.w   sc_MouseY(a0),d1

         move.w   #160,d2
         move.w   #128,d3
         movem.l  (a4),a0-a3
         addq     #1,d7
         andi.w   #$000f,d7
         movem.l  d0-d7/a0-a6,-(a7)
         bsr      CpuLine
         movem.l  (a7)+,d0-d7/a0-a6

         movea.l  4.w,a6
         movea.l  window(pc),a0
         movea.l  wd_UserPort(a0),a0
         jsr      _LVOGetMsg(a6)
         tst.l    d0
         beq.s    loop

         movea.l  d0,a1
         jsr      _LVOReplyMsg(a6)


         movea.l  window(pc),a0
         movea.l  ibase(pc),a6
         jsr      _LVOCloseWindow(a6)
nowind
         movea.l  screen(pc),a0
         jsr      _LVOCloseScreen(a6)
noscreen
         movea.l  4.w,a6
         movea.l  ibase(pc),a1
         jsr      _LVOCloseLibrary(a6)
noint
         rts

ns
         dc.w     0,0,320,256,4
         dc.b     -1,-1
         dc.w     0
         dc.w     CUSTOMSCREEN
         dc.l     0,0,0,0

nw       dc.w     0,1,320,255
         dc.b     -1,-1
         dc.l     CLOSEWINDOW
         dc.l     WINDOWCLOSE|WINDOWDRAG
         dc.l     0,0
         dc.l     windowtitle
scrptr   ds.l     1
         dc.l     0
         dc.w     0,0,0,0,CUSTOMSCREEN

window   ds.l     1
screen   ds.l     1
ibase    ds.l     1
iname    dc.b     'intuition.library',0
windowtitle
         dc.b     '*FAST*! Cpu line rutine - demo.',0
         EVEN

************************************************************************
* Fast CPU line rutine.
*
* Main Entry point: Draw a line from (d0,d1) to (d2,d3).
* a0-a3 are plane pointers.
* d7 is line color 0-15.
************************************************************************
CpuLine
         cmp.w    d1,d3             ;For simplicy, we don't ever draw a line
         bge.s    CpuLine2          ;upwards; instead, the points are swapped.

         exg      d0,d2
         exg      d1,d3

;Secondary entry point: Enter here if you know for sure that d1<=d3.
CpuLine2
* Calculate jump-offset from color-value.
         add.w    d7,d7
         add.w    d7,d7

* Calculation of the memory address of the starting point.
* NOTE! This is specific for a 40-byte wide screen.
* (It could be argued that this is not very general... also I am not certain
* that a mulu wouldn't be quicker, at least on some members of the 680xx
* family. Still room for improvement....
* Also, I use only WORD access to display memory. This makes for full speed
* in old 16bit chipmem, but is slower (I guess) than longword access on an
* A3000, should you be so lucky as to own one of those beasts. I guess that
* there are just too many different amigas out there for the perfect optimi-
* zation.

         move.w   d1,d4
         add.w    d4,d4
         add.w    d4,d4             ;y*4
         add.w    d1,d4             ;y*5
         lsl.w    #3,d4             ;*40
         move.w   d0,d5
         lsr.w    #4,d5
         add.w    d5,d5
         add.w    d5,d4             ;d4=offset into bitplanes.
         add.w    d4,a0
         add.w    d4,a1
         add.w    d4,a2
         add.w    d4,a3

         move.w   d0,d4
         andi.w   #$000f,d4         ;d4=bit no (from the left) of first point

* Now determine whether the line slants to the left or to the right.
         
         sub.w    d1,d3             ;The end-point isn't really usefull, but
         sub.w    d0,d2             ;the vector from (d0,d1) is!
         blt      lineleft          ;Our line is slanting to the left.

* Initialising some values.
         move.w   #$8000,d0         ;Mask for set pixel.
         move.w   #$7fff,d1         ;Mask for clear pixel.
* (Yes I know - d0 is not needed for color 0 + same for d1. Feel free to
* optimize further if you use these colors a lot...)

         move.w   d1,d5             ;Store just less than 0.5 for rounding.
         ror.w    d4,d0
         ror.w    d4,d1             ;Get bit in correct position.
;NOTE - Change this for different plane width.
         moveq    #40,d4            ;Width of planes in bytes.

* To draw the line with no 'holes' in it (and no overlap of points either),
* either the x-coord or the y-coord has to be changed by one for each point
* plotted, while the other is only changed occationally.

         cmp.w    d2,d3
         blt      alwaysright       ;Jump if x is to be changed always.


* Now we have to calculate the 'slant' of the line, ie. what persentage
* of the time do we have to increment the x-coord.
* Now if only i knew which of divu and divs were faster....
* First, however, avoid division by zero.

         tst.w    d3
         beq      singlepoint       ;Branch to draw only a single point.
* Need special case for d2=d3 to avoid overflow.
         cmp.w    d2,d3
         beq.s    deg45A
         swap     d2
         clr.w    d2                ;Need to calculate x*2^16/y.
         divu     d3,d2             ;d2 now contains line-slant.

         jmp      jump_tabA(pc,d7.w);Pick right loop from color.
deg45A
         move.l   #$FFFF,d2
         jmp      jump_tabA(pc,d7.w);Pick right loop from color.

jump_tabA
* This junk just creates 16 branches to labels tightloopA0..15.
* The most tricky bit is the 'label\<num>' bit. This expands to a label
* with the decimal representation of num at its end, so if num=5, it
* gives 'label5'. With my Devpac 2, this works nicely; hopefully it is
* also supported by other assemblers (else buy Devpac - it's great!)
bradefA  MACRO
         bra      tightloopA\<num>
         ENDM

num      SET      0
         REPT     16
         bradefA
num      SET      num+1
         ENDR

* This creates the 16 loops, one for each color.

loopdefA MACRO
* This is the loop that needs to be optimized real hard.
* It outputs one point with only
*  always  11 instructions, sometimes 14, occasionally 18.
*  this is 12 words,                  15 words,        19 words.

tightloopA\<num>
* Handle bit 0.
         IFEQ     (num)&1
         and.w    d1,(a0)
         ELSEIF
         or.w     d0,(a0)
         ENDC
* Handle bit 1.
         IFEQ     (num)&2
         and.w    d1,(a1)
         ELSEIF
         or.w     d0,(a1)
         ENDC
* Handle bit 2.
         IFEQ     (num)&4
         and.w    d1,(a2)
         ELSEIF
         or.w     d0,(a2)
         ENDC
* Handle bit 3.
         IFEQ     (num)&8
         and.w    d1,(a3)
         ELSEIF
         or.w     d0,(a3)
         ENDC

         add.w    d4,a0
         add.w    d4,a1
         add.w    d4,a2
         add.w    d4,a3             ;increase y-coord.

         add.w    d2,d5
         bcc.s    gonextA\<num>     ;Check if x is to be increased.

         IFNE     num-15            ;Clear bit not needed for color15.
         ror.w    #1,d1
         ENDC
         IFNE     num               ;Set bit not needed for color 0.
         ror.w    #1,d0
         bcc.s    gonextA\<num>     ;Check for word-boundary.
         ELSEIF
         bcs.s    gonextA\<num>
         ENDC

         addq.w   #2,a0             ;Need to handle word boundary.
         addq.w   #2,a1
         addq.w   #2,a2
         addq.w   #2,a3
gonextA\<num>
         dbra     d3,tightloopA\<num>
         rts

         ENDM

num      SET      0
         REPT     16
         loopdefA
num      SET      num+1

         ENDR


* Following are the remaining 3 octants.

alwaysright

* Now we have to calculate the 'slant' of the line, ie. what persentage
* of the time do we have to increment the y-coord.
* Now if only i knew which of divu and divs were faster....
* First, however, avoid division by zero.

         tst.w    d2
         beq      singlepoint       ;Branch to draw only a single point.
         swap     d3
         clr.w    d3                ;Need to calculate y*2^16/x.
         divu     d2,d3             ;d2 now contains line-slant.

         jmp      jump_tabC(pc,d7.w);Pick right loop from color.
jump_tabC
* This junk just creates 16 branches to labels tightloopC0..15.
* The most tricky bit is the 'label\<num>' bit. This expands to a label
* with the decimal representation of num at its end, so if num=5, it
* gives 'label5'. With my Devpac 2, this works nicely; hopefully it is
* also supported by other assemblers (else buy Devpac - it's great!)
bradefC  MACRO
         bra      tightloopC\<num>
         ENDM

num      SET      0
         REPT     16
         bradefC
num      SET      num+1
         ENDR

* This creates the 16 loops, one for each color.

loopdefC MACRO
* This is the loop that needs to be optimized real hard.
* It outputs one point with only
*  always  11 instructions, sometimes 14, occasionally 18.
*  this is 12 words,                  15 words,        19 words.
*
* This could be optimized by plotting points in 4 registers instead of
* going directly to memory. This would improve very flat lines, but would
* slow down lines nearer to 45 degrees unless another special case were
* added; I guess I was just too lazy.
tightloopC\<num>
* Handle bit 0.
         IFEQ     (num)&1
         and.w    d1,(a0)
         ELSEIF
         or.w     d0,(a0)
         ENDC
* Handle bit 1.
         IFEQ     (num)&2
         and.w    d1,(a1)
         ELSEIF
         or.w     d0,(a1)
         ENDC
* Handle bit 2.
         IFEQ     (num)&4
         and.w    d1,(a2)
         ELSEIF
         or.w     d0,(a2)
         ENDC
* Handle bit 3.
         IFEQ     (num)&8
         and.w    d1,(a3)
         ELSEIF
         or.w     d0,(a3)
         ENDC

         IFNE     num-15            ;Clear bit not needed for color15.
         ror.w    #1,d1
         ENDC
         IFNE     num               ;Set bit not needed for color 0.
         ror.w    #1,d0
         bcc.s    checkC\<num>      ;Check for word-boundary.
         ELSEIF
         bcs.s    checkC\<num>
         ENDC
         addq.w   #2,a0             ;Need to handle word boundary.
         addq.w   #2,a1
         addq.w   #2,a2
         addq.w   #2,a3

checkC\<num>
         add.w    d3,d5
         bcc.s    gonextC\<num>     ;Check if x is to be increased.

         adda.w   d4,a0
         adda.w   d4,a1
         adda.w   d4,a2
         adda.w   d4,a3             ;increase y-coord.

gonextC\<num>
         dbra     d2,tightloopC\<num>
         rts

         ENDM

num      SET      0
         REPT     16
         loopdefC
num      SET      num+1

         ENDR


lineleft
         neg.w    d2                ;get abs. value of x.
* Initialising some values.
         move.w   #$8000,d0         ;Mask for set pixel.
         move.w   #$7fff,d1         ;Mask for clear pixel.
* (Yes I know - d0 is not needed for color 0 + similar for d1. Feel free to
* optimize further if you use these colors a lot...)

         move.w   d1,d5             ;Store just less than 0.5 for rounding.
         ror.w    d4,d0
         ror.w    d4,d1             ;Get bit in correct position.
;NOTE - Change this for different plane width.
         moveq    #40,d4            ;Width of planes in bytes.

* To draw the line with no 'holes' in it (and no overlap of points either),
* either the x-coord or the y-coord has to be changed by one for each point
* plotted, while the other is only changed occationally.

         cmp.w    d2,d3
         blt      alwaysleft        ;Jump if x is to be changed always.


* Now we have to calculate the 'slant' of the line, ie. what persentage
* of the time do we have to increment the x-coord.
* Now if only i knew which of divu and divs were faster....
* First, however, avoid division by zero.

         tst.w    d3
         beq      singlepoint       ;Branch to draw only a single point.
         cmp.w    d3,d2
         beq.s    deg45B
         swap     d2
         clr.w    d2                ;Need to calculate x*2^16/y.
         divu     d3,d2             ;d2 now contains line-slant.

         jmp      jump_tabB(pc,d7.w);Pick right loop from color.
deg45B
         move.w   #$FFFF,d2
         jmp      jump_tabB(pc,d7.w);Pick right loop from color.

jump_tabB
* This junk just creates 16 branches to labels tightloopA0..15.
* The most tricky bit is the 'label\<num>' bit. This expands to a label
* with the decimal representation of num at its end, so if num=5, it
* gives 'label5'. With my Devpac 2, this works nicely; hopefully it is
* also supported by other assemblers (else buy Devpac - it's great!)
bradefB  MACRO
         bra      tightloopB\<num>
         ENDM

num      SET      0
         REPT     16
         bradefB
num      SET      num+1
         ENDR

* This creates the 16 loops, one for each color.

loopdefB MACRO
* This is the loop that needs to be optimized real hard.
* It outputs one point with only
*  always  11 instructions, sometimes 14, occasionally 18.
*  this is 12 words,                  15 words,        19 words.

tightloopB\<num>
* Handle bit 0.
         IFEQ     (num)&1
         and.w    d1,(a0)
         ELSEIF
         or.w     d0,(a0)
         ENDC
* Handle bit 1.
         IFEQ     (num)&2
         and.w    d1,(a1)
         ELSEIF
         or.w     d0,(a1)
         ENDC
* Handle bit 2.
         IFEQ     (num)&4
         and.w    d1,(a2)
         ELSEIF
         or.w     d0,(a2)
         ENDC
* Handle bit 3.
         IFEQ     (num)&8
         and.w    d1,(a3)
         ELSEIF
         or.w     d0,(a3)
         ENDC

         add.w    d4,a0
         add.w    d4,a1
         add.w    d4,a2
         add.w    d4,a3             ;increase y-coord.

         add.w    d2,d5
         bcc.s    gonextB\<num>     ;Check if x is to be increased.

         IFNE     num-15            ;Clear bit not needed for color15.
         rol.w    #1,d1
         ENDC
         IFNE     num               ;Set bit not needed for color 0.
         rol.w    #1,d0
         bcc.s    gonextB\<num>     ;Check for word-boundary.
         ELSEIF
         bcs.s    gonextB\<num>
         ENDC

         subq.w   #2,a0             ;Need to handle word boundary.
         subq.w   #2,a1
         subq.w   #2,a2
         subq.w   #2,a3
gonextB\<num>
         dbra     d3,tightloopB\<num>
         rts

         ENDM

num      SET      0
         REPT     16
         loopdefB
num      SET      num+1

         ENDR


alwaysleft

* Now we have to calculate the 'slant' of the line, ie. what persentage
* of the time do we have to increment the y-coord.
* Now if only i knew which of divu and divs were faster....
* First, however, avoid division by zero.

         tst.w    d2
         beq      singlepoint       ;Branch to draw only a single point.
         swap     d3
         clr.w    d3                ;Need to calculate y*2^16/x.
         divu     d2,d3             ;d2 now contains line-slant.

         jmp      jump_tabD(pc,d7.w);Pick right loop from color.
jump_tabD
* This junk just creates 16 branches to labels tightloopD0..15.
* The most tricky bit is the 'label\<num>' bit. This expands to a label
* with the decimal representation of num at its end, so if num=5, it
* gives 'label5'. With my Devpac 2, this works nicely; hopefully it is
* also supported by other assemblers (else buy Devpac - it's great!)
bradefD  MACRO
         bra      tightloopD\<num>
         ENDM

num      SET      0
         REPT     16
         bradefD
num      SET      num+1
         ENDR

* This creates the 16 loops, one for each color.

loopdefD MACRO
* This is the loop that needs to be optimized real hard.
* It outputs one point with only
*  always  11 instructions, sometimes 14, occasionally 18.
*  this is 12 words,                  15 words,        19 words.
*
* This could be optimized by plotting points in 4 registers instead of
* going directly to memory. This would improve very flat lines, but would
* slow down lines nearer to 45 degrees unless another special case were
* added; I guess I was just too lazy.
tightloopD\<num>
* Handle bit 0.
         IFEQ     (num)&1
         and.w    d1,(a0)
         ELSEIF
         or.w     d0,(a0)
         ENDC
* Handle bit 1.
         IFEQ     (num)&2
         and.w    d1,(a1)
         ELSEIF
         or.w     d0,(a1)
         ENDC
* Handle bit 2.
         IFEQ     (num)&4
         and.w    d1,(a2)
         ELSEIF
         or.w     d0,(a2)
         ENDC
* Handle bit 3.
         IFEQ     (num)&8
         and.w    d1,(a3)
         ELSEIF
         or.w     d0,(a3)
         ENDC

         IFNE     num-15            ;Clear bit not needed for color15.
         rol.w    #1,d1
         ENDC
         IFNE     num               ;Set bit not needed for color 0.
         rol.w    #1,d0
         bcc.s    checkD\<num>      ;Check for word-boundary.
         ELSEIF
         bcs.s    checkD\<num>
         ENDC
         subq.w   #2,a0             ;Need to handle word boundary.
         subq.w   #2,a1
         subq.w   #2,a2
         subq.w   #2,a3

checkD\<num>
         add.w    d3,d5
         bcc.s    gonextD\<num>     ;Check if x is to be increased.

         adda.w   d4,a0
         adda.w   d4,a1
         adda.w   d4,a2
         adda.w   d4,a3             ;increase y-coord.

gonextD\<num>
         dbra     d2,tightloopD\<num>
         rts

         ENDM

num      SET      0
         REPT     16
         loopdefD
num      SET      num+1

         ENDR


* Our (silly) caller passed us the same point for start and end - just give
* him the single lousy point he wants.
singlepoint
         jmp      jump_tabE(pc,d7.w);Pick right plotrutine from color.
jump_tabE
* This junk just creates 16 branches to labels plotE0..15.
* The most tricky bit is the 'label\<num>' bit. This expands to a label
* with the decimal representation of num at its end, so if num=5, it
* gives 'label5'. With my Devpac 2, this works nicely; hopefully it is
* also supported by other assemblers (else buy Devpac - it's great!)
bradefE  MACRO
         bra      plotE\<num>
         ENDM

num      SET      0
         REPT     16
         bradefE
num      SET      num+1
         ENDR

plotdefE MACRO
* Plot a point in color num.
* Handle bit 0.
plotE\<num>
         IFEQ     (num)&1
         and.w    d1,(a0)
         ELSEIF
         or.w     d0,(a0)
         ENDC
* Handle bit 1.
         IFEQ     (num)&2
         and.w    d1,(a1)
         ELSEIF
         or.w     d0,(a1)
         ENDC
* Handle bit 2.
         IFEQ     (num)&4
         and.w    d1,(a2)
         ELSEIF
         or.w     d0,(a2)
         ENDC
* Handle bit 3.
         IFEQ     (num)&8
         and.w    d1,(a3)
         ELSEIF
         or.w     d0,(a3)
         ENDC

         rts

         ENDM

num      SET      0
         REPT     16
         plotdefE
num      SET      num+1

         ENDR