Fractals re-written in assembly: 560 bytes

I re-wrote the fractal-generation code in assembly to see what gains I could make. Just a simple manual translation of the code came out to 560 bytes (320 instructions), as opposed to the 1019 bytes of compiled C. It produces the same output as the C version. There are places it could be optimized, but I just wanted a baseline written this way. As I often find in PIC assembly, there's a lot of loading and storing to do (you gotta love RISC), and that overhead before each subroutine call is expensive. The PIC16F1718 has a linearly-addressable memory and two auto-incrementing (or decrementing) pointer registers (FSR0/FSR1), and these can be used to automate the parameter passing overhead. I have some prototype code written with a new framework that should streamline numerical code like this, but haven't had a chance to re-implement the fractal code in it yet. I am anxious to see how much it improves things here.

All this is leading up to trying to get the ray tracer in 1kB (I can't let go). I also worked out a way to avoid square roots in testing intersections with spheres, which I'll detail in another log. That alone could save a decent chunk of code.

The naive assembly fractal code is listed here. I know of at least a few places where it could be tuned up - not least of which is inlining the four main calls, which saves 8 instructions (14 bytes!) right there - but those aren't terribly interesting optimizations.

;;;
;;; generate VGA Mandelbrot set in assembly
;;;
#include <p16f1718.inc>

  ;; The RADIX override below is commented out, so the assembler's default
  ;; radix applies; most decimal literals in this file use the explicit .N form.
  ;;RADIX           DEC
  ;; Suppress assembler message 302 ("register in operand not in bank 0").
  ;; The code below accesses banked registers and variables without a
  ;; BANKSEL before each one and relies on the bank left selected earlier.
  ERRORLEVEL     -302
  ;; Suppress assembler message 305 ("using default destination of 1 (file)").
  ;; Many instructions below omit the ",F"/",W" destination operand and so
  ;; write their result back to the file register.
  ERRORLEVEL     -305  
  
  ;; Device configuration words. The option symbols are defined by
  ;; p16f1718.inc; see the device data sheet for what each one selects.
  __CONFIG  _CONFIG1,  _FOSC_INTOSC & _WDTE_OFF & _PWRTE_ON & _MCLRE_ON & _CP_OFF & _BOREN_ON & _CLKOUTEN_OFF & _FCMEN_ON
  __CONFIG  _CONFIG2,  _WRT_ALL & _PPS1WAY_OFF & _ZCDDIS_ON & _PLLEN_ON & _STVREN_OFF & _BORV_LO & _LPBOR_OFF & _LVP_ON

;;;
;;;  h/w interface definition
;;;
;; Control-line bit masks.
;; REG_OE_bar and TIMER_en are the two masks written to LATA
;; (see LOAD_MODE / RUN_MODE).
#define REG_OE_bar  b'00010000'
#define TIMER_en    b'00001000'
;; The remaining control masks are written to LATB. WE_bit and CP_bit are
;; the bit numbers matching the WE_bar and CP_bar masks, for use with
;; bcf/bsf in WRITE_SRAM_BYTES.
#define WE_bar      b'00000001'
#define WE_bit      0
#define OE_bar      b'00000010'
#define CP_en       b'00001000'
#define MR_en       b'00100000'
#define CP_bar      b'00000100'
#define CP_bit      2  
#define MR_bar      b'00010000'

;; Layout of each byte written to the SRAM data port (LATC):
;; bit 7 = VSYNC, bit 6 = HSYNC, bits 5..0 = colour.
;; RGB() packs three 2-bit channels as red = bits 5:4, green = bits 3:2,
;; blue = bits 1:0. It is not referenced elsewhere in this listing.
#define VSYNC       b'10000000'
#define HSYNC       b'01000000'
#define RGB(r, g, b) (((r & 0x3) << 4) | ((g & 0x3) << 2) | (b & 0x3))

;; Horizontal blanking segment lengths, in bytes written per scanline
;; (used as sram_count values in WRITE_HSYNC).
#define H_FRONT_PORCH   .16
#define H_SYNC_PULSE    .96
#define H_BACK_PORCH    .48

;; Vertical blanking segment lengths, in scanlines
;; (used as line_count values in WRITE_FRAME).
#define V_FRONT_PORCH   .10
#define V_SYNC_PULSE    .2
#define V_BACK_PORCH    .33  

;; Iteration limit and escape masks for MANDELBROT_ITERATION.
;; Coordinates are 16-bit signed fixed point with 12 fractional bits
;; (products are shifted right 12 bits after each multiply).
;; ESCAPE_RADIUS masks the high byte of aa+bb: non-zero means the sum
;; is >= 0x4000, i.e. 4.0.
;; COMPONENT_RADIUS masks the top byte of a 32-bit square (24 fractional
;; bits): non-zero means the square is >= 8.0.
#define MAXITER         .255
#define ESCAPE_RADIUS   0xc0
#define COMPONENT_RADIUS 0xf8

;; View window: starting coordinate and per-pixel step in the same fixed
;; point format. Uncomment WHOLE_SET to select the first set of values;
;; otherwise the second set is used.
;#define WHOLE_SET  
#ifdef WHOLE_SET  
;; imag from -1.25, real from -2.5, step 21/4096 per pixel
#define IMAG_MIN        0xec00
#define REAL_MIN        0xd800
#define IMAG_STEP       0x0015
#define REAL_STEP       0x0015
#else
;; imag from -0.0625, real from -1.5, step 1/4096 per pixel
#define IMAG_MIN        0xff00
#define REAL_MIN        0xe800
#define IMAG_STEP       0x0001
#define REAL_STEP       0x0001  
#endif  
  
;;;
;;; variables
;;; 
  ;; One byte per name, assigned consecutive addresses starting at 0x20.
  ;; 16-bit quantities are stored as _l (low byte) / _h (high byte) pairs.
  ;; NOTE(review): no routine selects a bank before touching these, so they
  ;; are accessed in whatever bank is current (LOAD_MODE leaves the LATA bank
  ;; selected before WRITE_FRAME runs) - confirm against the data sheet that
  ;; this offset range is general-purpose RAM in that bank.
  cblock  0x20
    ;; WRITE_SRAM_BYTES parameters: byte to write and repeat count
    sram_value
    sram_count
    ;; WRITE_SCANLINE / WRITE_HSYNC parameters and loop counters
    line_vsync_value
    line_rgb_value
    line_count
    line_loop_count
    ;; WRITE_FRAME 16-bit row/column down-counters
    row_l
    row_h
    col_l
    col_h
    ;; mandelbrot calculation: iteration counter, Z = a + bi, point c + di,
    ;; and the squares aa, bb
    iter
    a_l
    a_h
    b_l
    b_h
    c_l
    c_h
    d_l
    d_h
    ;; dc, dd, aa_plus_bb, red, green and blue are not referenced by any
    ;; code in this listing
    dc_l
    dc_h
    dd_l
    dd_h
    aa_l
    aa_h
    bb_l
    bb_h
    aa_plus_bb_l
    aa_plus_bb_h
    red
    green
    blue
    ;; MUL16: multiplicand (widened to 32 bits while shifting), multiplier,
    ;; scratch flags, and the 32-bit product (prod_0 = least significant)
    mul_a_l
    mul_a_h
    mul_a_2
    mul_a_3
    mul_b_l
    mul_b_h
    mul16_flags
    prod_0  
    prod_1  
    prod_2  
    prod_3
    ;; ABS16 (no ABS16 routine appears in this listing; these are unreferenced)
    abs_l
    abs_h
    ;; MUL16_SIGNED: bit 7 holds the sign of the result
    mul_sign
    ;; MUL16_SHIFT: remaining single-bit shifts
    mul_shift_count
  endc

;;;
;;; reset vector
;;; 
    ;; Program entry at address 0: configure the chip, take control of the
    ;; SRAM, render one frame into it, hand the SRAM over for display, then
    ;; idle forever. The frame is generated exactly once.
    ORG         0
    call        SETUP_PERIPHERALS   ; oscillator and port setup
    call        LOAD_MODE           ; control lines set for writing the SRAM
    call        WRITE_FRAME         ; sync pattern + fractal pixels -> SRAM
    call        RUN_MODE            ; release data lines, start playback
MAIN_LOOP:
    ;; nothing further to do; spin here
    bra         MAIN_LOOP

SETUP_PERIPHERALS:
    ;; One-time chip setup: oscillator, digital I/O on ports A/B/C, all
    ;; latches cleared, all pins made outputs.
    ;; Takes no parameters. Clobbers W and the bank selection; on return
    ;; the bank containing TRISA is selected.
    ;; intosc 32 MHz
    BANKSEL     OSCCON
    movlw       b'11110000'
    movwf       OSCCON
    ;; select digital I/O
    BANKSEL     ANSELA
    clrf        ANSELA
    clrf        ANSELB
    clrf        ANSELC    
    ;; clear the output latches before the pins are switched to outputs
    ;; (presumably so every line starts out driven low)
    BANKSEL     LATA
    clrf        LATA
    clrf        LATB
    clrf        LATC
    ;; set TRIS bits: all outputs
    BANKSEL     TRISA
    clrf        TRISA
    clrf        TRISB
    clrf        TRISC    
    return
    
LOAD_MODE:
    ;; Set the control lines up for writing the SRAM from the PIC: writes
    ;; REG_OE_bar|TIMER_en to LATA, steps LATB through a CP high-low-high
    ;; sequence, then makes the data port (port C) all outputs.
    ;; Takes no parameters. Clobbers W. Returns with the LATA bank selected;
    ;; WRITE_FRAME and the routines it calls use LATB/LATC without a BANKSEL
    ;; of their own and depend on this.
    ;;
    ;; Notation: "X&0" contributes 0 to the OR expression, i.e. line X is
    ;; written low while its name stays visible in the source. This relies
    ;; on & binding tighter than | in assembler expressions.
    BANKSEL     LATA
    movlw       REG_OE_bar | TIMER_en
    movwf       LATA
    ;; toggle CP with MR low to reset address counter
    ;; NOTE(review): the MR_bar bit is written as 1 in these three writes
    ;; and as 0 in the "bring out of reset" write below - confirm the
    ;; polarity of MR_bar / MR_en against the schematic.
    movlw       WE_bar | OE_bar | CP_en | CP_bar   | MR_en | MR_bar
    movwf       LATB
    movlw       WE_bar | OE_bar | CP_en | CP_bar&0 | MR_en | MR_bar
    movwf       LATB    
    movlw       WE_bar | OE_bar | CP_en | CP_bar   | MR_en | MR_bar
    movwf       LATB        
    ;; bring out of reset
    movlw       WE_bar | OE_bar | CP_en | CP_bar   | MR_en | MR_bar&0 ;
    movwf       LATB
    ;; data lines all outputs
    BANKSEL     TRISC
    clrf        TRISC
    ;; leave the LAT bank selected for the SRAM write routines
    BANKSEL     LATA
    return

RUN_MODE:
    ;; Hand the SRAM over for playback: make the data port (port C) all
    ;; inputs so the PIC no longer drives it, write two control patterns
    ;; to LATB, then clear both LATA control bits.
    ;; Takes no parameters. Clobbers W. Returns with the LATB bank selected.
    ;; As in LOAD_MODE, "X&0" means line X is written low.
    ;; data lines all inputs
    BANKSEL     TRISC
    movlw       0xff
    movwf       TRISC
    BANKSEL     LATB
    ;; reset address counter, then let it rip
    ;; first write: WE_bar, MR_en and MR_bar high, everything else low
    movlw       WE_bar | OE_bar&0 | CP_en&0 | CP_bar&0 | MR_en   | MR_bar
    movwf       LATB
    ;; second write: only WE_bar remains high
    movlw       WE_bar | OE_bar&0 | CP_en&0 | CP_bar&0 | MR_en&0 | MR_bar&0 ;
    movwf       LATB
    ;; REG_OE_bar and TIMER_en both low (the value written is 0)
    movlw       REG_OE_bar&0 | TIMER_en&0;
    movwf       LATA
    return
    
;;;
;;; write a series of identical bytes to SRAM
;;; 
WRITE_SRAM_BYTES:
    ;; In:  sram_value = byte to write
    ;;      sram_count = number of copies to write. The count is decremented
    ;;                   before it is tested, so 0 on entry writes 256 bytes.
    ;; Out: sram_count = 0; W = sram_value.
    ;; No BANKSEL is done here: the caller must already have the bank
    ;; holding LATB/LATC selected, and the variables are accessed in that
    ;; same bank.
    movf        sram_value, W
    movwf       LATC                ; data byte stays on the port for the whole run
SRAM_LOOP:
    ;; toggle WE to write value
    bcf         LATB, WE_bit
    bsf         LATB, WE_bit
    ;; toggle CP to increment address
    bcf         LATB, CP_bit
    bsf         LATB, CP_bit
    decfsz      sram_count          ; default destination: result goes back to sram_count
    bra         SRAM_LOOP
    return

;;;
;;; write the horizontal sync and porches
;;; 
WRITE_HSYNC:  
    ;; Emit the horizontal blanking portion of one scanline as three runs
    ;; of identical bytes: front porch, sync pulse, back porch.
    ;; In:  line_vsync_value = VSYNC bit state for this line (read only)
    ;; Out: sram_value / sram_count clobbered (sram_count = 0), W clobbered
    ;; The HSYNC bit is set during both porches and left as supplied in
    ;; line_vsync_value during the pulse.

    ;; front porch: H_FRONT_PORCH bytes with HSYNC set
    movlw       H_FRONT_PORCH
    movwf       sram_count
    movf        line_vsync_value, W
    iorlw       HSYNC
    movwf       sram_value
    call        WRITE_SRAM_BYTES

    ;; sync pulse: H_SYNC_PULSE bytes of line_vsync_value as is
    movlw       H_SYNC_PULSE
    movwf       sram_count
    movf        line_vsync_value, W
    movwf       sram_value
    call        WRITE_SRAM_BYTES

    ;; back porch: H_BACK_PORCH bytes with HSYNC set again
    movlw       H_BACK_PORCH
    movwf       sram_count
    movf        line_vsync_value, W
    iorlw       HSYNC
    movwf       sram_value
    call        WRITE_SRAM_BYTES
    return
;;;
;;; write a series of VGA scanlines to SRAM
;;;
WRITE_SCANLINE:
    ;; Write line_count complete scanlines, each one the horizontal
    ;; blanking from WRITE_HSYNC followed by 640 identical pixel bytes.
    ;; In:  line_vsync_value = VSYNC bit state for these lines
    ;;      line_rgb_value   = colour bits for every pixel
    ;;      line_count       = number of lines (decremented before it is
    ;;                         tested, so 0 on entry produces 256 lines)
    ;; Out: line_count = 0; sram_value, sram_count, line_loop_count and W
    ;;      are clobbered.
    call        WRITE_HSYNC

    ;; pixel byte = line_rgb_value | line_vsync_value | HSYNC
    movf        line_rgb_value, W
    iorwf       line_vsync_value, W
    iorlw       HSYNC
    movwf       sram_value

    ;; 640 pixels as 4 runs of 160 (a single run is limited by the
    ;; one-byte sram_count)
    movlw       .4
    movwf       line_loop_count
SCANLINE_LOOP:
    movlw       .160
    movwf       sram_count
    call        WRITE_SRAM_BYTES
    decfsz      line_loop_count, F
    bra         SCANLINE_LOOP

    ;; next line, starting again from the horizontal blanking
    decfsz      line_count, F
    bra         WRITE_SCANLINE
    return

;;;
;;; 16x16->32 unsigned multiply
;;; 
MUL16:
    ;; Shift-and-add multiply.
    ;; In:  mul_a_h:mul_a_l = 16-bit unsigned multiplicand
    ;;      mul_b_h:mul_b_l = 16-bit unsigned multiplier
    ;; Out: prod_3:prod_2:prod_1:prod_0 = 32-bit product
    ;; Clobbers: W, STATUS, mul_a_l..mul_a_3 (left shifted),
    ;;           mul_b_l/mul_b_h (both 0 on return).
    ;; The loop ends as soon as no set bits remain in the multiplier, so
    ;; small multipliers take fewer passes.
    clrf        prod_0
    clrf        prod_1
    clrf        prod_2
    clrf        prod_3
    ;; widen the multiplicand to 32 bits so it can be shifted left in place
    clrf        mul_a_3
    clrf        mul_a_2
MUL16_LOOP:
    ;; if the current low bit of the multiplier is set, prod += multiplicand
    btfss       mul_b_l, 0
    bra         MUL16_NO_ADD
    movf        mul_a_l, W
    addwf       prod_0, F
    movf        mul_a_h, W
    addwfc      prod_1, F
    movf        mul_a_2, W
    addwfc      prod_2, F
    movf        mul_a_3, W
    addwfc      prod_3, F
MUL16_NO_ADD:
    ;; multiplicand <<= 1 (32 bits)
    lslf        mul_a_l, F
    rlf         mul_a_h, F
    rlf         mul_a_2, F
    rlf         mul_a_3, F
    ;; multiplier >>= 1 (16 bits); the bit leaving mul_b_h enters mul_b_l
    ;; through the carry
    lsrf        mul_b_h, F
    rrf         mul_b_l, F
    ;; keep going while any multiplier bit is left: Z is set only when
    ;; (mul_b_l | mul_b_h) == 0
    movf        mul_b_l, W
    iorwf       mul_b_h, W
    btfss       STATUS, Z
    bra         MUL16_LOOP
    return

;;;
;;; 16x16->32 signed multiply
;;; 
MUL16_SIGNED:
    ;; Signed multiply built on MUL16: record the sign of the result, make
    ;; both operands non-negative, multiply unsigned, then negate the
    ;; product if the signs differed.
    ;; In:  mul_a_h:mul_a_l, mul_b_h:mul_b_l = 16-bit two's complement values
    ;; Out: prod_3..prod_0 = 32-bit two's complement product
    ;; Clobbers: W, STATUS, mul_sign, mul_a_*, mul_b_* (and whatever MUL16
    ;;           clobbers).

    ;; bit 7 of mul_sign = sign(a) XOR sign(b)
    movf        mul_a_h, W
    xorwf       mul_b_h, W
    movwf       mul_sign

    ;; negate a if it is negative (complement, then add 1 with carry)
    btfss       mul_a_h, 7
    bra         MUL16_FLIP_A_DONE
    comf        mul_a_l, F
    comf        mul_a_h, F
    incf        mul_a_l, F
    btfsc       STATUS, Z           ; low byte wrapped to 0 -> carry into high byte
    incf        mul_a_h, F
MUL16_FLIP_A_DONE: 

    ;; negate b if it is negative
    btfss       mul_b_h, 7
    bra         MUL16_FLIP_B_DONE
    comf        mul_b_l, F
    comf        mul_b_h, F
    incf        mul_b_l, F
    btfsc       STATUS, Z
    incf        mul_b_h, F
MUL16_FLIP_B_DONE: 

    call        MUL16

    ;; same signs: the unsigned product is already the answer
    btfss       mul_sign, 7
    return

    ;; opposite signs: prod = -prod (complement all four bytes, add 1)
    comf        prod_0, F
    comf        prod_1, F
    comf        prod_2, F
    comf        prod_3, F
    movlw       1
    addwf       prod_0, F
    movlw       0                   ; movlw leaves the carry untouched
    addwfc      prod_1, F
    addwfc      prod_2, F
    addwfc      prod_3, F
    return

;;; shift right 8+W bits
MUL16_SHIFT:
    ;; Arithmetic right shift of the 32-bit product by 8 + W bits.
    ;; In:  W = number of single-bit shifts to do after the whole-byte
    ;;          shift. The count is decremented before it is tested, so
    ;;          W = 0 would shift 256 times; callers here pass 3 or 4.
    ;;      prod_3..prod_0 = signed 32-bit value
    ;; Out: prod_2:prod_1:prod_0 = shifted value (sign preserved);
    ;;      prod_3 is left unchanged.
    ;; Clobbers: W, STATUS, mul_shift_count (0 on return).
    movwf       mul_shift_count

    ;; shift by one whole byte: prod_2:prod_1:prod_0 <- prod_3:prod_2:prod_1
    movf        prod_1, W
    movwf       prod_0
    movf        prod_2, W
    movwf       prod_1
    movf        prod_3, W
    movwf       prod_2

MUL16_SHIFT_LOOP:
    ;; one more bit: the top byte keeps its sign bit, each lower byte
    ;; takes the carry from the byte above
    asrf        prod_2, F
    rrf         prod_1, F
    rrf         prod_0, F
    decfsz      mul_shift_count, F
    bra         MUL16_SHIFT_LOOP     
    return

;;;
;;; calculate escape iteration for complex point c + di
;;;   under: Z <- Z^2 + c + di
;;;   Z = a + bi
MANDELBROT_ITERATION:
    ;; In:  c_h:c_l = real part, d_h:d_l = imaginary part of the point,
    ;;      16-bit signed fixed point with 12 fractional bits
    ;; Out: iter = iterations still remaining when the point escaped
    ;;             (counts down from MAXITER); 0 if it never escaped
    ;; Clobbers: W, STATUS, a, b, aa, bb, mul_a_*, mul_b_*, prod_*, and
    ;;           the scratch bytes used by the multiply routines.
    ;; Z starts at 0
    clrf        a_l
    clrf        a_h
    clrf        b_l
    clrf        b_h  
    movlw       MAXITER
    movwf       iter  
MANDELBROT_LOOP:  
    ;; aa = a * a
    movf        a_l, W
    movwf       mul_a_l
    movwf       mul_b_l  
    movf        a_h, W
    movwf       mul_a_h
    movwf       mul_b_h    
    call        MUL16_SIGNED
    ;; test aa for escape
    ;; the raw product has 24 fractional bits, so prod_3 is its integer
    ;; part; any bit under COMPONENT_RADIUS means a*a >= 8, which would
    ;; not fit in 16 bits after the shift below
    movf        prod_3, W  
    andlw       COMPONENT_RADIUS
    btfss       STATUS, Z
    return
    ;; shift aa 12 bits post-mul
    ;; (MUL16_SHIFT shifts by 8 + W)
    movlw       4
    call        MUL16_SHIFT
    movf        prod_0, W
    movwf       aa_l
    movf        prod_1, W
    movwf       aa_h  
    
    ;; bb = b * b
    movf        b_l, W
    movwf       mul_a_l
    movwf       mul_b_l  
    movf        b_h, W
    movwf       mul_a_h
    movwf       mul_b_h    
    call        MUL16_SIGNED  
    ;; test bb for escape
    movf        prod_3, W  
    andlw       COMPONENT_RADIUS
    btfss       STATUS, Z
    return
    ;; shift bb 12 bits post-mul
    movlw       4
    call        MUL16_SHIFT
    movf        prod_0, W
    movwf       bb_l
    movf        prod_1, W
    movwf       bb_h  

    ;; b = 2 * a * b + d
    ;; (uses the old a and old b; a is not updated until the next step)
    movf        a_l, W
    movwf       mul_a_l
    movf        a_h, W
    movwf       mul_a_h
    movf        b_l, W
    movwf       mul_b_l
    movf        b_h, W
    movwf       mul_b_h
    call        MUL16_SIGNED
    ;; shift only 11 bits to effectively multiply by 2
    movlw       3
    call        MUL16_SHIFT
    ;; 16-bit add of d; the carry from the low byte survives the
    ;; movwf/movf in between and feeds addwfc
    movf        d_l, W
    addwf       prod_0, W
    movwf       b_l
    movf        d_h, W
    addwfc      prod_1, W
    movwf       b_h
  
    ;; a = aa - bb + c
    ;; 16-bit subtract then add; the borrow/carry out of each low byte is
    ;; consumed by subwfb/addwfc on the high byte
    movf        aa_l, W
    movwf       a_l
    movf        bb_l, W
    subwf       a_l
    movf        aa_h, W
    movwf       a_h
    movf        bb_h, W
    subwfb      a_h
    movf        c_l, W
    addwf       a_l
    movf        c_h, W
    addwfc      a_h

    ;; test aa + bb for escape
    ;; aa and bb are the squares computed at the top of this pass. The
    ;; low-byte add uses the default destination, so aa_l is overwritten;
    ;; only its carry matters. The high byte of the sum stays in W: any
    ;; bit under ESCAPE_RADIUS means aa + bb >= 4.0
    movf        bb_l, W
    addwf       aa_l
    movf        bb_h, W  
    addwfc      aa_h, W
    andlw       ESCAPE_RADIUS
    btfss       STATUS, Z
    return
  
    ;; not escaped yet: count down and go again; falls through to the
    ;; return with iter = 0 once the limit is reached
    decfsz      iter
    bra         MANDELBROT_LOOP
    return
  
;;;
;;;  write the VGA frame to SRAM
;;; 
WRITE_FRAME:
    ;; Fill the SRAM with one complete frame, in this order:
    ;;   V_BACK_PORCH blank lines, 480 image lines of 640 fractal pixels,
    ;;   V_FRONT_PORCH blank lines, V_SYNC_PULSE lines with VSYNC low,
    ;;   then two trailing bytes.
    ;; Takes no parameters. Clobbers W, STATUS and every variable used by
    ;; the routines it calls.
    ;; No BANKSEL is done here or in the callees; the caller must have the
    ;; bank holding LATB/LATC selected (LOAD_MODE leaves it that way).

    ;; write V. back porch
    clrf        line_rgb_value
    movlw       VSYNC
    movwf       line_vsync_value
    movlw       V_BACK_PORCH
    movwf       line_count
    call        WRITE_SCANLINE

    ;; Initial values for a 16-bit down-counter run as
    ;;   decfsz lo / bra loop / decfsz hi / bra loop
    ;; The first pass over the low byte is short (LOW(x) iterations) and
    ;; every later pass is 256, hence the +1 on the high byte.
    ;; NOTE: this gives x iterations only when LOW(x) != 0; a count whose
    ;; low byte is 0 would run 256 iterations too many. 480 and 640 are fine.
 #define COUNTER16H(x) (HIGH(x)+1)
 #define COUNTER16L(x) LOW(x)

    ;; init d
    movlw       LOW(IMAG_MIN)
    movwf       d_l  
    movlw       HIGH(IMAG_MIN)
    movwf       d_h
  
    movlw       COUNTER16H(.480)
    movwf       row_h
    movlw       COUNTER16L(.480)
    movwf       row_l
ROW_LOOP:

    ;; horizontal blanking for this image line. line_vsync_value still
    ;; holds VSYNC from the back-porch setup above and nothing inside this
    ;; loop writes it, so it is not reloaded on every row.
    call        WRITE_HSYNC
  
    ;; init c
    movlw       LOW(REAL_MIN)
    movwf       c_l  
    movlw       HIGH(REAL_MIN)
    movwf       c_h
  
    movlw       COUNTER16H(.640)
    movwf       col_h
    movlw       COUNTER16L(.640)
    movwf       col_l
COL_LOOP:

    ;; iter = remaining-iteration count for the point c + di
    call        MANDELBROT_ITERATION
    ;; pixel byte: both sync bits set, plus 6 colour bits taken from the
    ;; low 6 bits of iter with the bit pairs rearranged:
    ;;   colour bits 5:4 <- iter bits 1:0
    ;;   colour bits 3:2 <- iter bits 3:2
    ;;   colour bits 1:0 <- iter bits 5:4
    movlw       VSYNC | HSYNC
    movwf       sram_value
    movlw       0x3f
    andwf       iter                ; iter &= 0x3f
    movf        iter, W
    andlw       0x0c                ; W = iter bits 3:2, still in place
    swapf       iter                ; swap the nibbles of iter in place
    iorwf       iter, W
    andlw       0x3f    
    iorwf       sram_value    
    movlw       .1
    movwf       sram_count
    call        WRITE_SRAM_BYTES

    ;;  increment c
    movlw       LOW(REAL_STEP)
    addwf       c_l
    movlw       HIGH(REAL_STEP)
    addwfc      c_h  

    ;; end of col loop
    decfsz      col_l
    bra         COL_LOOP
    decfsz      col_h
    bra         COL_LOOP

    ;;  increment d
    movlw       LOW(IMAG_STEP)
    addwf       d_l
    movlw       HIGH(IMAG_STEP)
    addwfc      d_h  

    ;; end of row loop 
    decfsz      row_l
    bra         ROW_LOOP
    decfsz      row_h
    bra         ROW_LOOP
  
    ;; write V. front porch
    clrf        line_rgb_value
    movlw       VSYNC
    movwf       line_vsync_value
    movlw       V_FRONT_PORCH
    movwf       line_count
    call        WRITE_SCANLINE           
    ;; write V. sync pulse
    ;; (VSYNC bit low for these lines; line_rgb_value is still 0)
    clrf        line_vsync_value
    movlw       V_SYNC_PULSE
    movwf       line_count
    call        WRITE_SCANLINE
    ;; write two bytes of back porch to reset address counter
    movlw       VSYNC | HSYNC
    movwf       sram_value
    movlw       .2
    movwf       sram_count
    call        WRITE_SRAM_BYTES    
    return

    END

640x480 RLE Wrencher (4409 Bytes)

Discussions

Become a Hackaday.io Member