Moving to Assembly

The sprite drawing functions written in C are just too slow. As an experiment I wrote a draw_sprite function in assembly that, so far, handles only the fast case where the y coordinate is a multiple of 8. The results are outstanding! In the game 0xDEADBEEF, the alien and cow are always drawn using the fast case, so I was able to switch them over. The new routine saves about 10,000 cycles per frame. With a limit of 50,000 cycles per frame, a 10,000-cycle improvement is huge! The fart uses the slow method, where lots of shifting is needed to get the pixels into the right spot in vmem, and is still drawn by the C version. The next step is to rewrite the complex case as well. Below is the fast case written in assembly.

;-----------------------------------------------------------------------
; void draw_sprite(unsigned char* sprite, unsigned char x, unsigned char y,
;                  unsigned char w, unsigned char h, unsigned char flip)
;
; ORs a sprite into video memory at $3C00. Video memory is walked in
; stripes of 128 bytes; each stripe covers 8 pixel rows, so "page" below
; is y / 8. Sprite data is read as w bytes per stripe, stripe after stripe.
;
; Only the fast case (y a multiple of 8) is implemented. Any other y
; lands in the @complex stub, which currently returns without drawing.
;
; In:     A      = flip (0 = draw as stored, non-zero = reverse the byte
;                  order within each stripe)
;         stack  = h, w, y, x, sprite - fetched with popa/popax in that
;                  order (presumably the cc65 argument stack; confirm
;                  against the C prototype)
; Out:    nothing
; Uses:   tmp1 = h remaining      tmp2 = w          tmp3 = y
;         tmp4 = x                tmp5 = y & 7      tmp6 = y / 8
;         tmp7 = flip             tmp8 = scratch (flip path)
;         ptr1 = destination in video memory
;         ptr2 = source in sprite data
; Clobb:  A, X, Y, flags
; Notes:  w == 0 or h == 0 draws nothing. An h that is not a multiple
;         of 8 is rounded up to whole stripes.
;-----------------------------------------------------------------------
SPRITE_VMEM       = $3C00         ; start of video memory
SPRITE_STRIDE     = 128           ; bytes from one stripe to the next

_draw_sprite:
                  sta tmp7          ; flip

                  jsr popa
                  sta tmp1          ; h
                  jsr popa
                  sta tmp2          ; w
                  jsr popa
                  sta tmp3          ; y
                  jsr popa
                  sta tmp4          ; x
                  jsr popax
                  sta ptr2          ; sprite
                  stx ptr2+1

                  ; Every argument has been popped by now, so bailing out
                  ; here leaves the argument stack balanced.
                  lda tmp1
                  beq @nothing      ; h == 0
                  lda tmp2
                  bne @have_size    ; w != 0
@nothing:
                  rts

@have_size:
                  lda tmp3
                  lsr               ; divide by 8
                  lsr
                  lsr
                  sta tmp6          ; page
                  lda tmp3
                  and #$07          ; y % 8
                  sta tmp5
                  beq @fast         ; zero remainder -> fast case
                  jmp @complex      ; too far away for a relative branch

                  ; ---- Fast case: ptr1 = SPRITE_VMEM + page * 128 + x ----
@fast:
                  lda tmp6
                  jsr multiply_128  ; A = low byte, X = high byte
                  clc               ; do not rely on the helper's carry
                  adc #<SPRITE_VMEM
                  sta ptr1
                  txa
                  adc #>SPRITE_VMEM
                  sta ptr1+1

                  clc
                  lda ptr1
                  adc tmp4          ; + x
                  sta ptr1
                  bcc @next_stripe
                  inc ptr1+1

                  ; ---- One stripe: w bytes OR-ed into video memory ----
@next_stripe:
                  lda tmp7
                  bne @flip_stripe

                  ldy #0            ; Y = column, same index for src and dst
@copy_column:
                  lda (ptr1), y
                  ora (ptr2), y
                  sta (ptr1), y
                  iny
                  cpy tmp2          ; explicit bound: correct for w = 1 too
                  bne @copy_column
                  beq @stripe_done  ; always taken (Z is set by the cpy)

@flip_stripe:
                  ldx #0            ; X = source column, counting up
                  ldy tmp2
                  dey               ; Y = destination column, w-1 down to 0
@flip_column:
                  sty tmp8          ; park the destination column
                  txa
                  tay
                  lda (ptr2), y     ; sprite byte from the opposite end
                  ldy tmp8
                  ora (ptr1), y
                  sta (ptr1), y
                  dey
                  inx
                  cpx tmp2          ; stop after exactly w bytes
                  bne @flip_column

                  ; ---- Advance to the next stripe, or finish ----
@stripe_done:
                  lda tmp1
                  sec
                  sbc #8
                  beq @done         ; exactly consumed
                  bcc @done         ; h was not a multiple of 8: stop, don't wrap
                  sta tmp1

                  clc
                  lda ptr1          ; destination moves down one stripe
                  adc #SPRITE_STRIDE
                  sta ptr1
                  bcc @dst_advanced
                  inc ptr1+1
@dst_advanced:
                  clc
                  lda ptr2          ; source moves past the w bytes just used
                  adc tmp2
                  sta ptr2
                  bcc @src_advanced
                  inc ptr2+1
@src_advanced:
                  jmp @next_stripe

                  ; ---- Complex case (y not a multiple of 8) ----
@complex:
                  ; Not implemented yet: falls through and returns.

@done:
                  rts

;-----------------------------------------------------------------------
; multiply_128: 16-bit product of an 8-bit value and 128.
;
; In:     A = value
; Out:    A = low byte of value * 128, X = high byte, carry clear
; Clobb:  A, X, flags (Y and zero page are left untouched)
;
; value * 128 is value << 7, i.e. (value << 8) >> 1. So the high byte is
; value >> 1, and bit 0 of value becomes bit 7 of the low byte.
;-----------------------------------------------------------------------
multiply_128:
                  lsr a       ; A = value >> 1, carry = bit 0 of value
                  tax         ; X = high byte
                  lda #0      ; lda leaves the carry alone
                  ror a       ; A = carry << 7; the bit shifted out is 0,
                              ; so the carry is clear on return
                  rts

0xDEADBEEF

FRAM

Discussions

Become a Hackaday.io Member