I rewrote the fractal-generation code in assembly to see what gains I could make. A simple manual translation of the code came out to 560 bytes (320 instructions), as opposed to the 1019 bytes of compiled C, and it produces the same output as the C version. There are places where it could be optimized, but I just wanted a baseline written this way. As I often find in PIC assembly, there's a lot of loading and storing to do (you gotta love RISC), and that overhead before each subroutine call is expensive. The PIC16F1718 has linearly addressable memory and two auto-incrementing (or auto-decrementing) pointer registers (FSR0/FSR1), which can be used to cut down the parameter-passing overhead. I have some prototype code written with a new framework that should streamline numerical code like this, but I haven't had a chance to re-implement the fractal code in it yet. I am anxious to see how much it improves things here.
All this is leading up to trying to get the ray tracer in 1kB (I can't let go). I also worked out a way to avoid square roots in testing intersections with spheres, which I'll detail in another log. That alone could save a decent chunk of code.
The naive assembly fractal code is listed here. I know of at least a few places where it could be tuned up - not the least of which is inlining the four main calls, which would save 8 instructions (14 bytes!) right there - but those are not terribly interesting optimizations.
;;;
;;; generate VGA Mandelbrot set in assembly
;;;
#include <p16f1718.inc>
;;RADIX DEC
;; Assembler message 302 (register in operand not in bank 0) is silenced;
;; bank selection in this file is managed by hand with BANKSEL.
ERRORLEVEL -302
;; Assembler message 305 (using default destination of 1, i.e. the file
;; register) is silenced; many instructions below omit the destination
;; operand and rely on that default.
ERRORLEVEL -305
;; CONFIG1, going by the symbol names from p16f1718.inc: internal oscillator,
;; watchdog off, power-up timer on, MCLR pin enabled, code protection off,
;; brown-out reset on, clock-out disabled, fail-safe clock monitor on.
__CONFIG _CONFIG1, _FOSC_INTOSC & _WDTE_OFF & _PWRTE_ON & _MCLRE_ON & _CP_OFF & _BOREN_ON & _CLKOUTEN_OFF & _FCMEN_ON
;; CONFIG2, going by the symbol names: flash write protection (_WRT_ALL),
;; PPS one-way lock off, zero-cross detect disabled, PLL enabled, stack
;; overflow/underflow reset off, low brown-out trip point, low-power BOR off,
;; low-voltage programming on.
;; NOTE(review): confirm each setting against the data sheet before changing.
__CONFIG _CONFIG2, _WRT_ALL & _PPS1WAY_OFF & _ZCDDIS_ON & _PLLEN_ON & _STVREN_OFF & _BORV_LO & _LPBOR_OFF & _LVP_ON
;;;
;;; h/w interface definition
;;;
;; Control bits written to LATA by LOAD_MODE / RUN_MODE.
#define REG_OE_bar b'00010000'
#define TIMER_en b'00001000'
;; Control bits written to LATB.  WE_bit and CP_bit are the bit numbers of
;; WE_bar and CP_bar, for use with bcf/bsf in WRITE_SRAM_BYTES.
#define WE_bar b'00000001'
#define WE_bit 0
#define OE_bar b'00000010'
#define CP_en b'00001000'
#define MR_en b'00100000'
#define CP_bar b'00000100'
#define CP_bit 2
#define MR_bar b'00010000'
;; Sync bits OR'd into the bytes written to the SRAM through LATC.
;; The porches are written with the bit set and the sync pulses with it clear.
#define VSYNC b'10000000'
#define HSYNC b'01000000'
;; Packs three 2-bit channels into the low six bits of a byte:
;; red in bits 5:4, green in bits 3:2, blue in bits 1:0.
;; Not referenced by the code in this file.
#define RGB(r, g, b) (((r & 0x3) << 4) | ((g & 0x3) << 2) | (b & 0x3))
;; Horizontal timing: number of bytes WRITE_HSYNC writes for each phase.
#define H_FRONT_PORCH .16
#define H_SYNC_PULSE .96
#define H_BACK_PORCH .48
;; Vertical timing: number of scanlines WRITE_FRAME writes for each phase.
#define V_FRONT_PORCH .10
#define V_SYNC_PULSE .2
#define V_BACK_PORCH .33
;; Initial value of the iteration down-counter in MANDELBROT_ITERATION.
#define MAXITER .255
;; Mask applied to the high byte of aa + bb; any bit set ends the iteration.
#define ESCAPE_RADIUS 0xc0
;; Mask applied to the top byte of the 32-bit square a*a or b*b; any bit set
;; ends the iteration before the product is shifted down.
#define COMPONENT_RADIUS 0xf8
;; Region selection.  MANDELBROT_ITERATION shifts products right by 12 bits,
;; so these 16-bit values read as signed fixed point with 12 fraction bits:
;;   WHOLE_SET: imag from 0xec00 (-1.25), real from 0xd800 (-2.5), step 21/4096
;;   otherwise: imag from 0xff00 (-0.0625), real from 0xe800 (-1.5), step 1/4096
;#define WHOLE_SET
#ifdef WHOLE_SET
#define IMAG_MIN 0xec00
#define REAL_MIN 0xd800
#define IMAG_STEP 0x0015
#define REAL_STEP 0x0015
#else
#define IMAG_MIN 0xff00
#define REAL_MIN 0xe800
#define IMAG_STEP 0x0001
#define REAL_STEP 0x0001
#endif
;;;
;;; variables
;;;
;; All working storage, allocated from offset 0x20 upward, one byte per name.
;; Multi-byte values are split into _l/_h (or _0.._3) cells, low byte first.
;; NOTE(review): none of the routines that touch these cells issues a BANKSEL;
;; they run with whatever bank LOAD_MODE left selected (the LATA bank), so the
;; cells are reached at offset 0x20.. within that bank rather than in bank 0.
;; Confirm against the data sheet memory map before adding BANKSELs elsewhere.
cblock 0x20
;; WRITE_SRAM_BYTES
;; byte to write and number of times to write it
sram_value
sram_count
;; WRITE_LINE
;; sync bits and colour bits for a blank scanline, scanlines left to write,
;; and the inner 4x160 pixel loop counter used by WRITE_SCANLINE
line_vsync_value
line_rgb_value
line_count
line_loop_count
;; WRITE_FRAME
;; 16-bit down-counters for the row and column loops
row_l
row_h
col_l
col_h
;; mandelbrot calculation
;; iter: iteration down-counter; a/b: real/imaginary parts of Z;
;; c/d: real/imaginary parts of the point being tested;
;; aa/bb: the squares a*a and b*b after the post-multiply shift
iter
a_l
a_h
b_l
b_h
c_l
c_h
d_l
d_h
;; dc, dd, aa_plus_bb, red, green and blue are not referenced by the code
;; in this file
dc_l
dc_h
dd_l
dd_h
aa_l
aa_h
bb_l
bb_h
aa_plus_bb_l
aa_plus_bb_h
red
green
blue
;; MUL16
;; mul_a is widened to 32 bits inside MUL16 (mul_a_2/mul_a_3 are cleared
;; there); mul_b is the 16-bit multiplier; prod_0..prod_3 is the product
mul_a_l
mul_a_h
mul_a_2
mul_a_3
mul_b_l
mul_b_h
mul16_flags
prod_0
prod_1
prod_2
prod_3
;; ABS16
;; not referenced by the code in this file
abs_l
abs_h
;; MUL16_SIGNED
;; bit 7 holds the sign of the product
mul_sign
;; MUL16_SHIFT
mul_shift_count
endc
;;;
;;; reset vector
;;;
ORG 0
call SETUP_PERIPHERALS
call LOAD_MODE
call WRITE_FRAME
call RUN_MODE
MAIN_LOOP:
bra MAIN_LOOP
;;;
;;; SETUP_PERIPHERALS: oscillator and port initialisation.
;;; On return every pin of ports A, B and C is a digital output driving low.
;;; Leaves the TRISA bank selected; clobbers W.
;;;
SETUP_PERIPHERALS:
        ;; internal oscillator at 32 MHz
        BANKSEL OSCCON
        movlw   b'11110000'
        movwf   OSCCON

        ;; all pins digital (no analog inputs)
        BANKSEL ANSELA
        clrf    ANSELA
        clrf    ANSELB
        clrf    ANSELC

        ;; clear the output latches before enabling the drivers
        BANKSEL LATA
        clrf    LATA
        clrf    LATB
        clrf    LATC

        ;; every pin an output
        BANKSEL TRISA
        clrf    TRISA
        clrf    TRISB
        clrf    TRISC
        return
;;;
;;; LOAD_MODE: put the board in the state where the PIC writes the SRAM.
;;; Sets LATA to REG_OE_bar | TIMER_en, pulses CP_bar low once on LATB, then
;;; clears MR_bar, and makes port C (the data lines) all outputs.
;;; Leaves the LATA bank selected, which the SRAM-writing routines rely on.
;;; Clobbers W.
;;;
;;; NOTE(review): the original comments describe the CP pulse as happening
;;; "with MR low" and the last write as "bring out of reset", yet MR_bar is 1
;;; during the pulse and 0 afterwards.  Whether that is an inversion in the
;;; external logic or a stale comment cannot be told from this file; check
;;; the schematic.
;;;
LOAD_MODE:
        BANKSEL LATA
        movlw   REG_OE_bar | TIMER_en
        movwf   LATA

        ;; pulse CP_bar low then high to reset the address counter
        ;; (`X&0` is this file's way of writing "bit X cleared")
        movlw   WE_bar | OE_bar | CP_en | CP_bar | MR_en | MR_bar
        movwf   LATB
        movlw   WE_bar | OE_bar | CP_en | CP_bar&0 | MR_en | MR_bar
        movwf   LATB
        movlw   WE_bar | OE_bar | CP_en | CP_bar | MR_en | MR_bar
        movwf   LATB

        ;; release the counter from reset: same pattern with MR_bar cleared
        movlw   WE_bar | OE_bar | CP_en | CP_bar | MR_en | MR_bar&0 ;
        movwf   LATB

        ;; data lines driven by the PIC
        BANKSEL TRISC
        clrf    TRISC
        BANKSEL LATA
        return
;;;
;;; RUN_MODE: hand the SRAM over for display.
;;; Turns port C (the data lines) into inputs, writes two control patterns
;;; to LATB (the second leaves only WE_bar set), then clears LATA.
;;; Leaves the LATB bank selected; clobbers W.
;;;
RUN_MODE:
        ;; stop driving the data lines
        BANKSEL TRISC
        movlw   0xff
        movwf   TRISC
        BANKSEL LATB

        ;; reset the address counter, then let it free-run
        movlw   WE_bar | OE_bar&0 | CP_en&0 | CP_bar&0 | MR_en | MR_bar
        movwf   LATB
        movlw   WE_bar | OE_bar&0 | CP_en&0 | CP_bar&0 | MR_en&0 | MR_bar&0 ;
        movwf   LATB

        ;; REG_OE_bar and TIMER_en both low
        movlw   REG_OE_bar&0 | TIMER_en&0;
        movwf   LATA
        return
;;;
;;; write a series of identical bytes to SRAM
;;;
;;;
;;; WRITE_SRAM_BYTES: store the same byte at consecutive SRAM addresses.
;;; In:   sram_value = byte to store
;;;       sram_count = number of copies (a count of 0 gives 256, because the
;;;                    counter is decremented before it is tested)
;;; Out:  sram_count = 0
;;; Clobbers W.  Contains no BANKSEL: the caller must already have the LAT
;;; bank selected.
;;;
WRITE_SRAM_BYTES:
        movf    sram_value, W
        movwf   LATC                    ; data stays on the bus for the whole run
SRAM_LOOP:
        ;; strobe WE low/high to store the byte
        bcf     LATB, WE_bit
        bsf     LATB, WE_bit
        ;; strobe CP low/high to step to the next address
        bcf     LATB, CP_bit
        bsf     LATB, CP_bit
        decfsz  sram_count, F
        bra     SRAM_LOOP
        return
;;;
;;; write the horizontal sync and porches
;;;
;;;
;;; WRITE_HSYNC: write the horizontal blanking part of one scanline:
;;; front porch, sync pulse, back porch.
;;; In:   line_vsync_value = vertical-sync bit to carry on every byte
;;; The porches are written with HSYNC set, the pulse with HSYNC clear.
;;; Clobbers W, sram_value, sram_count.
;;;
WRITE_HSYNC:
        ;; front porch: H_FRONT_PORCH bytes, HSYNC set
        movlw   H_FRONT_PORCH
        movwf   sram_count
        movf    line_vsync_value, W
        iorlw   HSYNC
        movwf   sram_value
        call    WRITE_SRAM_BYTES

        ;; sync pulse: H_SYNC_PULSE bytes, HSYNC clear
        movlw   H_SYNC_PULSE
        movwf   sram_count
        movf    line_vsync_value, W
        movwf   sram_value
        call    WRITE_SRAM_BYTES

        ;; back porch: H_BACK_PORCH bytes, HSYNC set
        movlw   H_BACK_PORCH
        movwf   sram_count
        movf    line_vsync_value, W
        iorlw   HSYNC
        movwf   sram_value
        call    WRITE_SRAM_BYTES
        return
;;;
;;; write a series of VGA scanlines to SRAM
;;;
;;;
;;; WRITE_SCANLINE: write line_count identical scanlines of solid colour.
;;; In:   line_vsync_value = vertical-sync bit for these lines
;;;       line_rgb_value   = colour bits for the 640 visible pixels
;;;       line_count       = number of scanlines (0 gives 256)
;;; Out:  line_count = 0
;;; Each scanline is the blanking interval from WRITE_HSYNC followed by
;;; 640 bytes of line_vsync_value | HSYNC | line_rgb_value.
;;; Clobbers W, sram_value, sram_count, line_loop_count.
;;;
WRITE_SCANLINE:
        call    WRITE_HSYNC

        ;; pixel byte = sync bits | colour
        movf    line_vsync_value, W
        iorlw   HSYNC
        movwf   sram_value
        movf    line_rgb_value, W
        iorwf   sram_value, F

        ;; sram_count is only 8 bits wide, so 640 pixels go out as 4 x 160
        movlw   .4
        movwf   line_loop_count
SCANLINE_LOOP:
        movlw   .160
        movwf   sram_count
        call    WRITE_SRAM_BYTES
        decfsz  line_loop_count, F
        bra     SCANLINE_LOOP

        ;; next scanline, if any remain
        decfsz  line_count, F
        bra     WRITE_SCANLINE
        return
;;;
;;; 16x16->32 unsigned multiply
;;;
;;;
;;; MUL16: 16x16 -> 32 unsigned multiply (shift-and-add).
;;; In:   mul_a_h:mul_a_l = multiplicand
;;;       mul_b_h:mul_b_l = multiplier
;;; Out:  prod_3..prod_0  = product, prod_0 least significant
;;; Destroys both operands: the multiplier is shifted down to zero and the
;;; multiplicand is shifted left through mul_a_3:mul_a_2.
;;; Clobbers W and STATUS.
;;;
;;; The loop stops as soon as no multiplier bits remain, so small multipliers
;;; finish early.  The "multiplier is zero" test ORs the two multiplier bytes
;;; and checks Z, instead of saving two copies of STATUS in a scratch byte.
;;;
MUL16:
        clrf    prod_0
        clrf    prod_1
        clrf    prod_2
        clrf    prod_3
        clrf    mul_a_3                 ; widen the multiplicand to 32 bits
        clrf    mul_a_2
MUL16_LOOP:
        btfss   mul_b_l, 0              ; low multiplier bit set?
        bra     MUL16_NO_ADD
        ;; prod += mul_a (32-bit add; movf leaves the carry untouched)
        movf    mul_a_l, W
        addwf   prod_0, F
        movf    mul_a_h, W
        addwfc  prod_1, F
        movf    mul_a_2, W
        addwfc  prod_2, F
        movf    mul_a_3, W
        addwfc  prod_3, F
MUL16_NO_ADD:
        ;; mul_a <<= 1
        lslf    mul_a_l, F
        rlf     mul_a_h, F
        rlf     mul_a_2, F
        rlf     mul_a_3, F
        ;; mul_b >>= 1
        lsrf    mul_b_h, F
        rrf     mul_b_l, F
        ;; finished once every remaining multiplier bit is zero
        movf    mul_b_l, W
        iorwf   mul_b_h, W
        btfss   STATUS, Z
        bra     MUL16_LOOP
        return
;;;
;;; 16x16->32 signed multiply
;;;
;;;
;;; MUL16_SIGNED: 16x16 -> 32 signed (two's complement) multiply.
;;; In:   mul_a_h:mul_a_l, mul_b_h:mul_b_l = signed operands
;;; Out:  prod_3..prod_0 = signed product
;;; Method: record the sign of the result, replace each negative operand by
;;; its magnitude, multiply unsigned with MUL16, then negate the product if
;;; the signs differed.
;;; Clobbers W, STATUS, mul_sign and everything MUL16 clobbers.
;;;
MUL16_SIGNED:
        ;; mul_sign bit 7 = sign(a) xor sign(b)
        movf    mul_a_h, W
        movwf   mul_sign

        ;; a = |a|
        btfss   mul_a_h, 7
        bra     MUL16_FLIP_A_DONE
        comf    mul_a_l, F
        comf    mul_a_h, F
        incf    mul_a_l, F
        btfsc   STATUS, Z               ; low byte wrapped to 0: carry into high
        incf    mul_a_h, F
MUL16_FLIP_A_DONE:
        movf    mul_b_h, W
        xorwf   mul_sign, F

        ;; b = |b|
        btfss   mul_b_h, 7
        bra     MUL16_FLIP_B_DONE
        comf    mul_b_l, F
        comf    mul_b_h, F
        incf    mul_b_l, F
        btfsc   STATUS, Z               ; low byte wrapped to 0: carry into high
        incf    mul_b_h, F
MUL16_FLIP_B_DONE:
        call    MUL16

        ;; same signs: the unsigned product is already the answer
        btfss   mul_sign, 7
        return

        ;; prod = -prod: complement, then add 1, rippling upward only while
        ;; the byte just incremented wrapped to zero
        comf    prod_0, F
        comf    prod_1, F
        comf    prod_2, F
        comf    prod_3, F
        incfsz  prod_0, F
        return
        incfsz  prod_1, F
        return
        incfsz  prod_2, F
        return
        incf    prod_3, F
        return
;;; shift right 8+W bits
;;;
;;; MUL16_SHIFT: arithmetic shift of the product right by 8 + W bits.
;;; In:   W = extra bit count beyond the first 8; must be at least 1
;;;           (0 would run the loop 256 times, because the counter is
;;;           decremented before it is tested)
;;;       prod_3..prod_0 = signed 32-bit value
;;; Out:  prod_2..prod_0 = shifted value; prod_3 is left as it was
;;; Clobbers W, STATUS, mul_shift_count.
;;;
MUL16_SHIFT:
        movwf   mul_shift_count

        ;; the first 8 bits are a plain byte move down
        movf    prod_1, W
        movwf   prod_0
        movf    prod_2, W
        movwf   prod_1
        movf    prod_3, W
        movwf   prod_2
MUL16_SHIFT_LOOP:
        ;; one bit per pass; asrf keeps the sign bit of the top byte
        asrf    prod_2, F
        rrf     prod_1, F
        rrf     prod_0, F
        decfsz  mul_shift_count, F
        bra     MUL16_SHIFT_LOOP
        return
;;;
;;; calculate escape iteration for complex point c + di
;;; under: Z <- Z^2 + c + di
;;; Z = a + bi
;;;
;;; MANDELBROT_ITERATION: iterate Z <- Z^2 + (c + di) from Z = 0 until it
;;; escapes or the iteration budget runs out.
;;; In:   c_h:c_l = real part, d_h:d_l = imaginary part of the point
;;;       (signed 16-bit values; products are shifted right 12 bits, i.e.
;;;       the arithmetic treats them as having 12 fraction bits)
;;; Out:  iter = iterations still remaining when the point escaped, or 0 if
;;;       all MAXITER iterations ran without an escape
;;; Z = a + bi lives in a_h:a_l and b_h:b_l.
;;; Clobbers W, STATUS, a, b, aa, bb and all multiply scratch.
;;;
MANDELBROT_ITERATION:
        clrf    a_l
        clrf    a_h
        clrf    b_l
        clrf    b_h
        movlw   MAXITER
        movwf   iter
MANDELBROT_LOOP:
        ;; ---- aa = a * a ----
        movf    a_l, W
        movwf   mul_a_l
        movwf   mul_b_l
        movf    a_h, W
        movwf   mul_a_h
        movwf   mul_b_h
        call    MUL16_SIGNED
        ;; escape if the square has bits under COMPONENT_RADIUS in its top
        ;; byte (checked before the product is shifted down)
        movf    prod_3, W
        andlw   COMPONENT_RADIUS
        btfss   STATUS, Z
        return
        ;; shift right 8 + 4 = 12 bits after the multiply
        movlw   4
        call    MUL16_SHIFT
        movf    prod_0, W
        movwf   aa_l
        movf    prod_1, W
        movwf   aa_h

        ;; ---- bb = b * b ----
        movf    b_l, W
        movwf   mul_a_l
        movwf   mul_b_l
        movf    b_h, W
        movwf   mul_a_h
        movwf   mul_b_h
        call    MUL16_SIGNED
        ;; same escape test on the other component
        movf    prod_3, W
        andlw   COMPONENT_RADIUS
        btfss   STATUS, Z
        return
        ;; shift right 12 bits after the multiply
        movlw   4
        call    MUL16_SHIFT
        movf    prod_0, W
        movwf   bb_l
        movf    prod_1, W
        movwf   bb_h

        ;; ---- b = 2 * a * b + d  (uses the old a and the old b) ----
        movf    a_l, W
        movwf   mul_a_l
        movf    a_h, W
        movwf   mul_a_h
        movf    b_l, W
        movwf   mul_b_l
        movf    b_h, W
        movwf   mul_b_h
        call    MUL16_SIGNED
        ;; shifting 8 + 3 = 11 bits instead of 12 supplies the factor of 2
        movlw   3
        call    MUL16_SHIFT
        movf    d_l, W
        addwf   prod_0, W
        movwf   b_l
        movf    d_h, W
        addwfc  prod_1, W
        movwf   b_h

        ;; ---- a = aa - bb + c ----
        ;; (movf/movwf between the low and high halves leave the carry alone)
        movf    aa_l, W
        movwf   a_l
        movf    bb_l, W
        subwf   a_l, F
        movf    aa_h, W
        movwf   a_h
        movf    bb_h, W
        subwfb  a_h, F
        movf    c_l, W
        addwf   a_l, F
        movf    c_h, W
        addwfc  a_h, F

        ;; ---- escape if the high byte of aa + bb has bits under ESCAPE_RADIUS ----
        ;; the low-byte sum lands in aa_l; aa is recomputed on the next pass
        movf    bb_l, W
        addwf   aa_l, F
        movf    bb_h, W
        addwfc  aa_h, W
        andlw   ESCAPE_RADIUS
        btfss   STATUS, Z
        return

        decfsz  iter, F
        bra     MANDELBROT_LOOP
        return
;;;
;;; write the VGA frame to SRAM
;;;
;;;
;;; WRITE_FRAME: fill the SRAM with one complete video frame, in this order:
;;;   V_BACK_PORCH blank scanlines,
;;;   480 scanlines of 640 computed pixels each,
;;;   V_FRONT_PORCH blank scanlines,
;;;   V_SYNC_PULSE scanlines with VSYNC clear,
;;;   two trailing blanking bytes.
;;; Takes no inputs.  Contains no BANKSEL: the caller must already have the
;;; LAT bank selected.  Clobbers W, STATUS and every working variable it or
;;; its callees use.
;;;
WRITE_FRAME:
        ;; vertical back porch
        clrf    line_rgb_value
        movlw   VSYNC
        movwf   line_vsync_value
        movlw   V_BACK_PORCH
        movwf   line_count
        call    WRITE_SCANLINE

;; Split a 16-bit count into the two bytes of a nested decfsz counter: the
;; low byte runs down first, then the high byte, with the low byte wrapping
;; through 256 on every later pass.  The +1 on the high byte is only right
;; when LOW(x) is non-zero, which holds for 480 and 640.
#define COUNTER16H(x) (HIGH(x)+1)
#define COUNTER16L(x) LOW(x)

        ;; d = first row's imaginary coordinate; 480 rows to go
        movlw   LOW(IMAG_MIN)
        movwf   d_l
        movlw   HIGH(IMAG_MIN)
        movwf   d_h
        movlw   COUNTER16H(.480)
        movwf   row_h
        movlw   COUNTER16L(.480)
        movwf   row_l
ROW_LOOP:
        ;; horizontal blanking for this row
        movlw   VSYNC
        movwf   line_vsync_value
        call    WRITE_HSYNC

        ;; c = first column's real coordinate; 640 columns to go
        movlw   LOW(REAL_MIN)
        movwf   c_l
        movlw   HIGH(REAL_MIN)
        movwf   c_h
        movlw   COUNTER16H(.640)
        movwf   col_h
        movlw   COUNTER16L(.640)
        movwf   col_l
COL_LOOP:
        call    MANDELBROT_ITERATION

        ;; Build the pixel byte: both sync bits set, plus a 6-bit colour
        ;; made from the low six bits of iter with the bit pairs rearranged:
        ;;   output bits 5:4 <- iter bits 1:0
        ;;   output bits 3:2 <- iter bits 3:2
        ;;   output bits 1:0 <- iter bits 5:4
        movlw   VSYNC | HSYNC
        movwf   sram_value
        movlw   0x3f
        andwf   iter, F
        movf    iter, W
        andlw   0x0c                    ; keep bits 3:2 in place
        swapf   iter, F                 ; swap nibbles: 1:0 -> 5:4, 5:4 -> 1:0
        iorwf   iter, W
        andlw   0x3f
        iorwf   sram_value, F
        movlw   .1
        movwf   sram_count
        call    WRITE_SRAM_BYTES

        ;; c += REAL_STEP
        movlw   LOW(REAL_STEP)
        addwf   c_l, F
        movlw   HIGH(REAL_STEP)
        addwfc  c_h, F

        ;; next column
        decfsz  col_l, F
        bra     COL_LOOP
        decfsz  col_h, F
        bra     COL_LOOP

        ;; d += IMAG_STEP
        movlw   LOW(IMAG_STEP)
        addwf   d_l, F
        movlw   HIGH(IMAG_STEP)
        addwfc  d_h, F

        ;; next row
        decfsz  row_l, F
        bra     ROW_LOOP
        decfsz  row_h, F
        bra     ROW_LOOP

        ;; vertical front porch
        clrf    line_rgb_value
        movlw   VSYNC
        movwf   line_vsync_value
        movlw   V_FRONT_PORCH
        movwf   line_count
        call    WRITE_SCANLINE

        ;; vertical sync pulse: VSYNC bit clear on every byte
        clrf    line_vsync_value
        movlw   V_SYNC_PULSE
        movwf   line_count
        call    WRITE_SCANLINE

        ;; two more blanking bytes so the address counter can be reset
        movlw   VSYNC | HSYNC
        movwf   sram_value
        movlw   .2
        movwf   sram_count
        call    WRITE_SRAM_BYTES
        return
END
Discussions
Become a Hackaday.io Member
Create an account to leave a comment. Already have an account? Log In.