« Back to project details Sort by:

Fractals re-written in assembly: 560 bytes

01/17/2017 at 19:41 • 0 comments

I re-wrote the fractal-generation code in assembly to see what gains I could make. Just a simple manual translation of the code came out to 560 bytes (320 instructions), as opposed to the 1019 bytes of compiled C. It produces the same output as the C version. There are places it could be optimized, but I just wanted a baseline written this way. As I often find in PIC assembly, there's a lot of loading and storing to do (you gotta love RISC), and that overhead before each subroutine call is expensive. The PIC16F1718 has a linearly-addressable memory and two auto-incrementing (or decrementing) pointer registers (FSR0/FSR1), and these can be used to automate the parameter passing overhead. I have some prototype code written with a new framework that should streamline numerical code like this, but haven't had a chance to re-implement the fractal code in it yet. I am anxious to see how much it improves things here.

All this is leading up to trying to get the ray tracer in 1kB (I can't let go). I also worked out a way to avoid square roots in testing intersections with spheres, which I'll detail in another log. That alone could save a decent chunk of code.

The naive assembly fractal code is listed here. I know of at least a few places it could be tuned up - not the least of which is inlining the four main calls - that's 8 instructions (14 bytes!) right there. But not terribly interesting ones.

;;;
;;; generate VGA Mandelbrot set in assembly
;;;
#include <p16f1718.inc>

  ;;RADIX           DEC
  ;; suppress assembler message 302 (register operand not in bank 0);
  ;; bank selection is done by hand with BANKSEL in the code below
  ERRORLEVEL     -302
  ;; suppress assembler message 305 (default destination used); many
  ;; instructions below omit the destination operand and rely on the
  ;; assembler default of "file register"
  ERRORLEVEL     -305  
  
  ;; device configuration words; the individual options are the symbol
  ;; names from p16f1718.inc AND-ed together (internal oscillator,
  ;; watchdog off, PLL enabled, low-voltage programming enabled, ...)
  __CONFIG  _CONFIG1,  _FOSC_INTOSC & _WDTE_OFF & _PWRTE_ON & _MCLRE_ON & _CP_OFF & _BOREN_ON & _CLKOUTEN_OFF & _FCMEN_ON
  __CONFIG  _CONFIG2,  _WRT_ALL & _PPS1WAY_OFF & _ZCDDIS_ON & _PLLEN_ON & _STVREN_OFF & _BORV_LO & _LPBOR_OFF & _LVP_ON

;;;
;;;  h/w interface definition
;;;
;; bit masks written to LATA (see LOAD_MODE / RUN_MODE)
#define REG_OE_bar  b'00010000'
#define TIMER_en    b'00001000'
;; bit masks written to LATB; WE_bit / CP_bit are the bit numbers of
;; WE_bar / CP_bar for use with bcf / bsf
#define WE_bar      b'00000001'
#define WE_bit      0
#define OE_bar      b'00000010'
#define CP_en       b'00001000'
#define MR_en       b'00100000'
#define CP_bar      b'00000100'
#define CP_bit      2  
#define MR_bar      b'00010000'

;; layout of each byte stored in SRAM (written through LATC):
;;   bit 7 = VSYNC, bit 6 = HSYNC, bits 5:4 = red, 3:2 = green, 1:0 = blue
#define VSYNC       b'10000000'
#define HSYNC       b'01000000'
;; pack three 2-bit colour components into bits 5:0; every argument is
;; parenthesized so that expression arguments expand with the intended
;; precedence
#define RGB(r, g, b) ((((r) & 0x3) << 4) | (((g) & 0x3) << 2) | ((b) & 0x3))

;; horizontal blanking, in SRAM bytes per scanline (one byte per dot
;; clock); used by WRITE_HSYNC
#define H_FRONT_PORCH   .16
#define H_SYNC_PULSE    .96
#define H_BACK_PORCH    .48

;; vertical blanking, in scanlines; used by WRITE_FRAME
#define V_FRONT_PORCH   .10
#define V_SYNC_PULSE    .2
#define V_BACK_PORCH    .33  

;; MAXITER: starting value of the iteration down-counter 'iter'
;; ESCAPE_RADIUS: mask applied to the high byte of aa + bb; any bit set
;;   ends the iteration
;; COMPONENT_RADIUS: mask applied to the top byte (prod_3) of a squared
;;   component; any bit set ends the iteration
#define MAXITER         .255
#define ESCAPE_RADIUS   0xc0
#define COMPONENT_RADIUS 0xf8

;; region of the complex plane to render: 16-bit start values loaded
;; into c (real) and d (imaginary) and the per-pixel / per-row steps.
;; NOTE(review): values look like two's complement fixed point with 12
;; fractional bits (products are shifted right 12 bits in
;; MANDELBROT_ITERATION) - confirm.
;#define WHOLE_SET  
#ifdef WHOLE_SET  
#define IMAG_MIN        0xec00
#define REAL_MIN        0xd800
#define IMAG_STEP       0x0015
#define REAL_STEP       0x0015
#else
#define IMAG_MIN        0xff00
#define REAL_MIN        0xe800
#define IMAG_STEP       0x0001
#define REAL_STEP       0x0001  
#endif  
  
;;;
;;; variables
;;; 
  ;; one byte per name, assigned consecutive addresses starting at 0x20.
  ;; NOTE(review): the routines that use these run with the bank left
  ;; selected by "BANKSEL LATA" at the end of LOAD_MODE (message 302 is
  ;; suppressed), so these offsets presumably resolve inside that bank -
  ;; confirm against the data sheet memory map.
  cblock  0x20
    ;; WRITE_SRAM_BYTES
    ;; byte to store, and how many consecutive addresses to store it at
    sram_value
    sram_count
    ;; WRITE_LINE
    ;; VSYNC bit state and colour for the line(s), number of lines, and
    ;; the inner burst counter used by WRITE_SCANLINE
    line_vsync_value
    line_rgb_value
    line_count
    line_loop_count
    ;; WRITE_FRAME
    ;; two-byte down-counters for the row and column loops
    row_l
    row_h
    col_l
    col_h
    ;; mandelbrot calculation
    ;; iter: iteration down-counter; a/b: real/imaginary part of Z;
    ;; c/d: real/imaginary part of the point; aa/bb: a*a and b*b.
    ;; dc, dd, aa_plus_bb, red, green and blue are not referenced by the
    ;; code in this listing.
    iter
    a_l
    a_h
    b_l
    b_h
    c_l
    c_h
    d_l
    d_h
    dc_l
    dc_h
    dd_l
    dd_h
    aa_l
    aa_h
    bb_l
    bb_h
    aa_plus_bb_l
    aa_plus_bb_h
    red
    green
    blue
    ;; MUL16
    ;; mul_a is widened to 32 bits (mul_a_2 / mul_a_3) while it is
    ;; shifted left; prod_0 is the least significant product byte
    mul_a_l
    mul_a_h
    mul_a_2
    mul_a_3
    mul_b_l
    mul_b_h
    mul16_flags
    prod_0  
    prod_1  
    prod_2  
    prod_3
    ;; ABS16
    ;; (no ABS16 routine or reference appears in this listing)
    abs_l
    abs_h
    ;; MUL16_SIGNED
    ;; bit 7 holds the sign of the product
    mul_sign
    ;; MUL16_SHIFT
    mul_shift_count
  endc

;;;
;;; reset vector
;;; 
    ;; program entry: configure the chip, take control of the SRAM
    ;; address counter, fill the SRAM with one complete frame, then hand
    ;; the counter back to the free-running video circuit
    ORG         0
    call        SETUP_PERIPHERALS
    call        LOAD_MODE
    call        WRITE_FRAME
    call        RUN_MODE
    ;; nothing left to do: spin forever
MAIN_LOOP:
    bra         MAIN_LOOP

;; SETUP_PERIPHERALS: one-time chip setup - oscillator, digital I/O on
;; ports A/B/C, output latches cleared, all pins set as outputs.
;; Returns with the bank of TRISA selected; clobbers W.
SETUP_PERIPHERALS:
    ;; intosc 32 MHz
    BANKSEL     OSCCON
    movlw       b'11110000'
    movwf       OSCCON
    ;; select digital I/O
    BANKSEL     ANSELA
    clrf        ANSELA
    clrf        ANSELB
    clrf        ANSELC    
    ;; set TRIS bits: all outputs
    ;; (latches are zeroed first so the pins drive low once enabled)
    BANKSEL     LATA
    clrf        LATA
    clrf        LATB
    clrf        LATC
    BANKSEL     TRISA
    clrf        TRISA
    clrf        TRISB
    clrf        TRISC    
    return
    
;; LOAD_MODE: put the control lines under PIC control for bit-banging
;; data into the SRAM and reset the SRAM address counter.
;; In the LATB patterns "X&0" evaluates to 0; it is written that way to
;; show which control bit is deliberately left clear in each step.
;; Returns with the bank of LATA selected, which the SRAM writing
;; routines rely on; clobbers W.
LOAD_MODE:
    BANKSEL     LATA
    movlw       REG_OE_bar | TIMER_en
    movwf       LATA
    ;; toggle CP with MR low to reset address counter
    ;; (CP_bar bit goes 1 -> 0 -> 1 while the other bits stay fixed)
    movlw       WE_bar | OE_bar | CP_en | CP_bar   | MR_en | MR_bar
    movwf       LATB
    movlw       WE_bar | OE_bar | CP_en | CP_bar&0 | MR_en | MR_bar
    movwf       LATB    
    movlw       WE_bar | OE_bar | CP_en | CP_bar   | MR_en | MR_bar
    movwf       LATB        
    ;; bring out of reset
    ;; (only the MR_bar bit changes, from 1 to 0)
    movlw       WE_bar | OE_bar | CP_en | CP_bar   | MR_en | MR_bar&0 ;
    movwf       LATB
    ;; data lines all outputs
    BANKSEL     TRISC
    clrf        TRISC
    BANKSEL     LATA
    return

;; RUN_MODE: release the SRAM data bus and hand the control lines back
;; to the free-running video circuit. Port C becomes all inputs, the
;; address counter is reset once more, then every LATB bit except
;; WE_bar, and every LATA bit, is cleared. Clobbers W; returns with the
;; bank of LATB selected.
RUN_MODE:
    ;; data lines all inputs
    BANKSEL     TRISC
    movlw       0xff
    movwf       TRISC
    BANKSEL     LATB
    ;; reset address counter, then let it rip
    ;; first pattern: WE_bar, MR_en and MR_bar set; second: WE_bar only
    movlw       WE_bar | OE_bar&0 | CP_en&0 | CP_bar&0 | MR_en   | MR_bar
    movwf       LATB
    movlw       WE_bar | OE_bar&0 | CP_en&0 | CP_bar&0 | MR_en&0 | MR_bar&0 ;
    movwf       LATB
    ;; both LATA control bits cleared (the expression evaluates to 0)
    movlw       REG_OE_bar&0 | TIMER_en&0;
    movwf       LATA
    return
    
;;;
;;; WRITE_SRAM_BYTES: store one byte value at consecutive SRAM addresses
;;;
;;;   in:   sram_value  byte to store
;;;         sram_count  number of addresses to fill; the counter is
;;;                     decremented before it is tested, so 0 gives 256
;;;   out:  sram_count = 0
;;;   clobbers: W
;;;
WRITE_SRAM_BYTES:
    ;; put the byte on the data lines once; it stays there for every write
    movf        sram_value, W
    movwf       LATC
SRAM_NEXT_BYTE:
    ;; WE low then high: write the value at the current address
    bcf         LATB, WE_bit
    bsf         LATB, WE_bit
    ;; CP low then high: step the address counter
    bcf         LATB, CP_bit
    bsf         LATB, CP_bit
    decfsz      sram_count, F
    bra         SRAM_NEXT_BYTE
    return

;;;
;;; WRITE_HSYNC: store the horizontal blanking part of one scanline
;;;
;;;   in:   line_vsync_value  VSYNC bit state for this line
;;;   Writes H_FRONT_PORCH bytes with the HSYNC bit set, H_SYNC_PULSE
;;;   bytes with it clear, then H_BACK_PORCH bytes with it set again.
;;;   clobbers: W, sram_value, sram_count
;;;
WRITE_HSYNC:
    ;; front porch: HSYNC bit set
    movlw       H_FRONT_PORCH
    movwf       sram_count
    movf        line_vsync_value, W
    iorlw       HSYNC
    movwf       sram_value
    call        WRITE_SRAM_BYTES
    ;; sync pulse: HSYNC bit clear
    movlw       H_SYNC_PULSE
    movwf       sram_count
    movf        line_vsync_value, W
    movwf       sram_value
    call        WRITE_SRAM_BYTES
    ;; back porch: HSYNC bit set again
    movlw       H_BACK_PORCH
    movwf       sram_count
    movf        line_vsync_value, W
    iorlw       HSYNC
    movwf       sram_value
    call        WRITE_SRAM_BYTES
    return
;;;
;;; WRITE_SCANLINE: store line_count identical, single-colour scanlines
;;;
;;;   in:   line_vsync_value  VSYNC bit state for these lines
;;;         line_rgb_value    colour bits for the 640 visible pixels
;;;         line_count        number of scanlines (0 gives 256)
;;;   out:  line_count = 0
;;;   clobbers: W, sram_value, sram_count, line_loop_count
;;;
WRITE_SCANLINE:
    call        WRITE_HSYNC

    ;; 640 visible pixels are written as 4 bursts of 160 bytes
    movlw       .4
    movwf       line_loop_count
    ;; pixel byte = vsync state | HSYNC | colour
    movf        line_vsync_value, W
    iorlw       HSYNC
    movwf       sram_value
    movf        line_rgb_value, W
    iorwf       sram_value, F
SCANLINE_BURST:
    movlw       .160
    movwf       sram_count
    call        WRITE_SRAM_BYTES
    decfsz      line_loop_count, F
    bra         SCANLINE_BURST
    ;; next scanline, if any remain
    decfsz      line_count, F
    bra         WRITE_SCANLINE
    return

;;;
;;; MUL16: 16x16->32 unsigned multiply (shift-and-add)
;;;
;;;   in:   mul_a_h:mul_a_l   multiplicand
;;;         mul_b_h:mul_b_l   multiplier
;;;   out:  prod_3:prod_2:prod_1:prod_0 = mul_a * mul_b
;;;   clobbers: W, STATUS, mul_a_l..mul_a_3 (shifted left each pass),
;;;             mul_b_l / mul_b_h (shifted right until zero)
;;;
;;;   The loop ends as soon as no multiplier bits remain, so small
;;;   multipliers finish early. mul16_flags is no longer used here.
;;;
MUL16:
    clrf        prod_0
    clrf        prod_1
    clrf        prod_2
    clrf        prod_3
    ;; widen the multiplicand to 32 bits
    clrf        mul_a_3
    clrf        mul_a_2
MUL16_LOOP:
    ;; add the (shifted) multiplicand when the current multiplier bit is 1
    btfss       mul_b_l, 0
    bra         MUL16_NO_ADD
    ;; 32-bit add; movf leaves the carry flag alone, so the carry chain
    ;; through addwfc stays intact
    movf        mul_a_l, W
    addwf       prod_0, F
    movf        mul_a_h, W
    addwfc      prod_1, F
    movf        mul_a_2, W
    addwfc      prod_2, F
    movf        mul_a_3, W
    addwfc      prod_3, F
MUL16_NO_ADD:
    ;; multiplicand <<= 1 (32 bits)
    lslf        mul_a_l, F
    rlf         mul_a_h, F
    rlf         mul_a_2, F
    rlf         mul_a_3, F
    ;; multiplier >>= 1 (16 bits); rrf picks up the bit lsrf shifted out
    lsrf        mul_b_h, F
    rrf         mul_b_l, F
    ;; done when both multiplier bytes are zero
    movf        mul_b_l, W
    iorwf       mul_b_h, W
    btfss       STATUS, Z
    bra         MUL16_LOOP
    return

;;;
;;; MUL16_SIGNED: 16x16->32 two's complement multiply
;;;
;;;   in:   mul_a_h:mul_a_l, mul_b_h:mul_b_l   signed operands
;;;   out:  prod_3..prod_0 = signed product
;;;   Both operands are replaced by their magnitudes, multiplied with
;;;   MUL16, and the product is negated when the operand signs differ.
;;;   clobbers: W, STATUS, mul_sign, and everything MUL16 clobbers
;;;
MUL16_SIGNED:
    ;; mul_sign bit 7 starts as the sign of a
    movf        mul_a_h, W
    movwf       mul_sign
    btfss       mul_a_h, 7
    bra         MUL16_FLIP_A_DONE
    ;; a = -a: complement both bytes, add 1, carry into the high byte
    ;; when the low byte wraps to zero
    comf        mul_a_l, F
    comf        mul_a_h, F
    incf        mul_a_l, F
    btfsc       STATUS, Z
    incf        mul_a_h, F
MUL16_FLIP_A_DONE:
    ;; mul_sign bit 7 becomes sign(a) xor sign(b)
    movf        mul_b_h, W
    xorwf       mul_sign, F
    btfss       mul_b_h, 7
    bra         MUL16_FLIP_B_DONE
    ;; b = -b
    comf        mul_b_l, F
    comf        mul_b_h, F
    incf        mul_b_l, F
    btfsc       STATUS, Z
    incf        mul_b_h, F
MUL16_FLIP_B_DONE:
    call        MUL16
    ;; product of the magnitudes is already right if the signs matched
    btfss       mul_sign, 7
    return
    ;; otherwise negate all 32 bits: complement, then add 1 with the
    ;; carry rippling upward only while each byte wraps to zero
    comf        prod_0, F
    comf        prod_1, F
    comf        prod_2, F
    comf        prod_3, F
    incf        prod_0, F
    btfsc       STATUS, Z
    incf        prod_1, F
    btfsc       STATUS, Z
    incf        prod_2, F
    btfsc       STATUS, Z
    incf        prod_3, F
    return

;;;
;;; MUL16_SHIFT: arithmetic shift of the product right by 8+W bits
;;;
;;;   in:   W = extra bit count after the initial 8-bit shift; the
;;;         counter is decremented before it is tested, so W = 0 gives
;;;         256 loop passes, not 0
;;;         prod_3..prod_0 = 32-bit signed value
;;;   out:  prod_2:prod_1:prod_0 = shifted value (prod_3 is left as is)
;;;   clobbers: W, STATUS, mul_shift_count
;;;
MUL16_SHIFT:
    movwf       mul_shift_count
    ;; first 8 bits: move every byte down one place, lowest first
    movf        prod_1, W
    movwf       prod_0
    movf        prod_2, W
    movwf       prod_1
    movf        prod_3, W
    movwf       prod_2
MUL16_SHIFT_LOOP:
    ;; remaining bits one at a time; asrf keeps the sign bit
    asrf        prod_2, F
    rrf         prod_1, F
    rrf         prod_0, F
    decfsz      mul_shift_count, F
    bra         MUL16_SHIFT_LOOP
    return

;;;
;;; MANDELBROT_ITERATION: iterate Z <- Z^2 + (c + di), starting at Z = 0
;;;   with Z = a + bi
;;;
;;;   in:   c_h:c_l   real part of the point
;;;         d_h:d_l   imaginary part of the point
;;;   out:  iter = what is left of the MAXITER down-counter when one of
;;;         the escape tests fired, or 0 when none fired
;;;   clobbers: W, STATUS, a, b, aa, bb and everything the multiply
;;;             routines clobber
;;;
MANDELBROT_ITERATION:
    ;; Z = 0
    clrf        a_h
    clrf        a_l
    clrf        b_h
    clrf        b_l
    movlw       MAXITER
    movwf       iter
MANDELBROT_LOOP:
    ;; ---- aa = a * a ----
    movf        a_h, W
    movwf       mul_a_h
    movwf       mul_b_h
    movf        a_l, W
    movwf       mul_a_l
    movwf       mul_b_l
    call        MUL16_SIGNED
    ;; bail out if the square is already too large
    movf        prod_3, W
    andlw       COMPONENT_RADIUS
    btfss       STATUS, Z
    return
    ;; rescale: shift right 8 + 4 = 12 bits, keep the low 16 bits
    movlw       .4
    call        MUL16_SHIFT
    movf        prod_0, W
    movwf       aa_l
    movf        prod_1, W
    movwf       aa_h

    ;; ---- bb = b * b ----
    movf        b_h, W
    movwf       mul_a_h
    movwf       mul_b_h
    movf        b_l, W
    movwf       mul_a_l
    movwf       mul_b_l
    call        MUL16_SIGNED
    ;; bail out if the square is already too large
    movf        prod_3, W
    andlw       COMPONENT_RADIUS
    btfss       STATUS, Z
    return
    ;; rescale: shift right 12 bits
    movlw       .4
    call        MUL16_SHIFT
    movf        prod_0, W
    movwf       bb_l
    movf        prod_1, W
    movwf       bb_h

    ;; ---- b = 2 * a * b + d (uses the old a, so it comes before a is updated) ----
    movf        b_l, W
    movwf       mul_b_l
    movf        b_h, W
    movwf       mul_b_h
    movf        a_l, W
    movwf       mul_a_l
    movf        a_h, W
    movwf       mul_a_h
    call        MUL16_SIGNED
    ;; shifting one bit less (8 + 3 = 11) supplies the factor of 2
    movlw       .3
    call        MUL16_SHIFT
    ;; 16-bit add of d; movf leaves the carry flag untouched
    movf        d_l, W
    addwf       prod_0, W
    movwf       b_l
    movf        d_h, W
    addwfc      prod_1, W
    movwf       b_h

    ;; ---- a = aa - bb + c ----
    ;; a = aa, then 16-bit subtract of bb (borrow carried by subwfb)
    movf        aa_l, W
    movwf       a_l
    movf        bb_l, W
    subwf       a_l, F
    movf        aa_h, W
    movwf       a_h
    movf        bb_h, W
    subwfb      a_h, F
    ;; then 16-bit add of c
    movf        c_l, W
    addwf       a_l, F
    movf        c_h, W
    addwfc      a_h, F

    ;; ---- escape test on aa + bb ----
    ;; the low-byte sum is written back to aa_l only to produce the
    ;; carry; just the high byte of the sum is examined
    movf        bb_l, W
    addwf       aa_l, F
    movf        bb_h, W
    addwfc      aa_h, W
    andlw       ESCAPE_RADIUS
    btfss       STATUS, Z
    return

    ;; count this pass and go again until the counter runs out
    decfsz      iter, F
    bra         MANDELBROT_LOOP
    return
  
;;;
;;; WRITE_FRAME: store one complete VGA frame in SRAM
;;;
;;;   Order stored: V_BACK_PORCH blank lines, 480 image lines of 640
;;;   Mandelbrot pixels each, V_FRONT_PORCH blank lines, V_SYNC_PULSE
;;;   lines with the VSYNC bit clear, then two more bytes with VSYNC and
;;;   HSYNC set.
;;;   clobbers: W, STATUS and every variable used by the routines it calls
;;;
WRITE_FRAME:
    ;; write V. back porch
    clrf        line_rgb_value
    movlw       VSYNC
    movwf       line_vsync_value
    movlw       V_BACK_PORCH
    movwf       line_count
    call        WRITE_SCANLINE

 ;; Initial values for a two-byte down-counter driven by
 ;;     decfsz lo / bra loop / decfsz hi / bra loop
 ;; The low byte runs LOW(x) passes first (256 passes when LOW(x) is 0)
 ;; and 256 passes for each further step of the high byte, so the high
 ;; byte must start at HIGH(x-1)+1. The earlier HIGH(x)+1 form gave the
 ;; same values for 480 and 640 but 256 passes too many for any
 ;; multiple of 256.
 #define COUNTER16H(x) (HIGH((x)-1)+1)
 #define COUNTER16L(x) LOW(x)

    ;; init d (imaginary part) for the first row
    movlw       LOW(IMAG_MIN)
    movwf       d_l
    movlw       HIGH(IMAG_MIN)
    movwf       d_h

    movlw       COUNTER16H(.480)
    movwf       row_h
    movlw       COUNTER16L(.480)
    movwf       row_l
ROW_LOOP:

    ;; horizontal blanking for this image line
    movlw       VSYNC
    movwf       line_vsync_value
    call        WRITE_HSYNC

    ;; init c (real part) for the first column
    movlw       LOW(REAL_MIN)
    movwf       c_l
    movlw       HIGH(REAL_MIN)
    movwf       c_h

    movlw       COUNTER16H(.640)
    movwf       col_h
    movlw       COUNTER16L(.640)
    movwf       col_l
COL_LOOP:

    call        MANDELBROT_ITERATION
    ;; build the pixel byte: both sync bits set plus a colour made from
    ;; the low 6 bits of iter with its 2-bit fields reordered:
    ;;   iter[1:0] -> bits 5:4, iter[3:2] -> bits 3:2, iter[5:4] -> bits 1:0
    ;; (iter is destroyed; MANDELBROT_ITERATION reloads it each call)
    movlw       VSYNC | HSYNC
    movwf       sram_value
    movlw       0x3f
    andwf       iter, F
    movf        iter, W
    andlw       0x0c
    swapf       iter, F
    iorwf       iter, W
    andlw       0x3f
    iorwf       sram_value, F
    movlw       .1
    movwf       sram_count
    call        WRITE_SRAM_BYTES

    ;;  increment c
    movlw       LOW(REAL_STEP)
    addwf       c_l, F
    movlw       HIGH(REAL_STEP)
    addwfc      c_h, F

    ;; end of col loop
    decfsz      col_l, F
    bra         COL_LOOP
    decfsz      col_h, F
    bra         COL_LOOP

    ;;  increment d
    movlw       LOW(IMAG_STEP)
    addwf       d_l, F
    movlw       HIGH(IMAG_STEP)
    addwfc      d_h, F

    ;; end of row loop
    decfsz      row_l, F
    bra         ROW_LOOP
    decfsz      row_h, F
    bra         ROW_LOOP

    ;; write V. front porch
    clrf        line_rgb_value
    movlw       VSYNC
    movwf       line_vsync_value
    movlw       V_FRONT_PORCH
    movwf       line_count
    call        WRITE_SCANLINE
    ;; write V. sync pulse (VSYNC bit clear)
    clrf        line_vsync_value
    movlw       V_SYNC_PULSE
    movwf       line_count
    call        WRITE_SCANLINE
    ;; write two bytes of back porch to reset address counter
    movlw       VSYNC | HSYNC
    movwf       sram_value
    movlw       .2
    movwf       sram_count
    call        WRITE_SRAM_BYTES
    return

    END

640x480 RLE Wrencher (4409 Bytes)
01/06/2017 at 23:50 • 19 comments
I had to do it.
The logo plus decoding code takes up 2519 14-bit instructions (4409 bytes). I used a simple RLE compression and stored the compressed data in a header file. I only compressed the left half image, then decompressed the runs in reverse for the right half. The whole code is uploaded here, but the decompression part looks like:
```
/*
 * Write one complete 640x480 frame to SRAM from the run-length data in
 * rle_wrencher.
 *
 * Each image row is stored as a run count followed by that many run
 * lengths, covering the left half of the row only. The runs alternate
 * between colour 0 and colour 0x3f, starting with 0. The right half is
 * produced by replaying the same runs in reverse order. A run count of 0
 * means the whole row is colour 0.
 */
void GenerateFrame()
{
  const uint8_t *cursor = rle_wrencher;  // next unread byte of the RLE data
  uint16_t rows_left;

  GenerateLine( VSYNC   , RGB(0, 0, 0),  33);  // V back porch

  for (rows_left = 480; rows_left != 0; --rows_left) {
    uint8_t run_count;
    uint8_t runs_left;
    uint8_t color = 0;

    write_SRAM_bytes( VSYNC | HSYNC   | 0 , 16);  // H front porch
    write_SRAM_bytes( VSYNC | HSYNC&0 | 0 , 96);  // H sync pulse
    write_SRAM_bytes( VSYNC | HSYNC   | 0 , 48);  // H back porch

    // first byte of the row record: how many runs follow
    run_count = *cursor++;

    // left half: runs in stored order
    if (run_count == 0) {
      write_SRAM_bytes( VSYNC | HSYNC   | 0, 160);
      write_SRAM_bytes( VSYNC | HSYNC   | 0, 160);
    } else {
      for (runs_left = run_count; runs_left != 0; --runs_left) {
        write_SRAM_bytes( VSYNC | HSYNC   | color, *cursor++);
        color ^= 0x3f;
      }
    }

    // undo the toggle made after the last left-half run so the right
    // half starts with that run's colour
    color ^= 0x3f;

    // right half: the same runs read backwards
    if (run_count == 0) {
      write_SRAM_bytes( VSYNC | HSYNC   | 0, 160);
      write_SRAM_bytes( VSYNC | HSYNC   | 0, 160);
    } else {
      for (runs_left = run_count; runs_left != 0; --runs_left) {
        write_SRAM_bytes( VSYNC | HSYNC   | color, *--cursor);
        color ^= 0x3f;
      }
    }

    // the backwards pass rewound the cursor to the first run length;
    // step over the runs again to reach the next row record
    cursor += run_count;
  }

  GenerateLine( VSYNC   , RGB(0, 0, 0),  10);  // V front porch
  GenerateLine( VSYNC&0 , RGB(0, 0, 0),  2);   // V sync pulse
  write_SRAM_bytes( VSYNC | HSYNC | 0, 2);     // end of vsync; resets counter
}
```
I really wanted to fit this into 1kB, but ran out of time.
Quadtrees?
I also experimented with quadtree compression, which should take better advantage of the solid 2D areas (not just 1D runs). The best strategy I found was to compress the top and bottom halves as 256x256 blocks (bottom visualized here):
Again, symmetry would be used to create the right side. I got the data down to 1047 bytes this way, but didn't think I could add the decompression code *and* find a way to make it all fit in 1kB, so I abandoned the effort.
I think you probably could get this image into 1kB, though, if you really worked at it.
Now, back to other projects...
For those that want to try compressing this 640x480 rendering of the wrencher, here is the one I used as a png. I do not know anything about the intellectual property status of this image, so, you know...don't sue me or anything. I make no representations whatsoever about rights to use this logo or this specific rendering of it. I hear it's a touchy subject - but then again, there are a number of instances of people using the logo, then not being sued for trademark infringement, so that sounds like failure-to-enforce. But failing to enforce copyright doesn't weaken the copyright, so ... whatever. Then again, this is their site, so if they have an issue with this, they should send themselves a takedown notice.

SRAM Size

I took a look at this page to determine the SRAM requirements for various SVGA modes. With this adapter, each dot clock in the whole frame needs a slot in the SRAM. For example, in 640x480, a total of 800x525 = 420000 bytes are required, since the video frame consists of 525 lines of 800 dot clocks each. Here's a summary of some common modes and what size memories they fit into:

MODE	Line Width	Lines	Bytes Req'd	512k x 8	1M x 8	2M x 8	4M x 8
640x480	800	525	420000	X	X	X	X
768x576	976	597	582672		X	X	X
800x600	1024	625	640000		X	X	X
1024x768	1344	806	1083264			X	X
1280x1024	1688	1066	1799408			X	X

I didn't bother with modes above this, because the required clock speed becomes the limiting factor.

As you can see, 640x480 is the only common resolution that fits in the 512k SRAM I used - to go any higher, you'd need a bigger one. The (5x) 74AC163 counters could generate 1M addresses (20 bits); beyond that, and you'd need to expand the counter.

Clock Frequency

I looked at the minimum VESA-standard refresh rates for the above modes (typically 60 Hz), what dot clock frequencies are required, and the period of this frequency:

MODE	Refresh	Dot Clock	Period
640x480	60 Hz	25.175 MHz	39.7 ns
768x576	60 Hz	34.96 MHz	28.6 ns
800x600	56 Hz	36 MHz	27.8 ns
1024x768	60 Hz	44.9 MHz	22.3 ns
1280x1024	60 Hz	108 MHz	9.26 ns

According to TI's datasheet, the 74AC163 will count at 103 MHz over commercial temperatures at 5V. But, the maximum propagation delay from the clock to the outputs is 15 ns. I used a 12ns SRAM, although I see them (in less hacker-friendly packages) down to 6ns. With the SRAM I used, you might be limited to a (12+15 =) 27 ns cycle time, which could do 800x600 but no higher. Moving to a 6ns SRAM allows a cycle time of (6+15 = 21), which with some tricks and tweaks might get you to 1024x768.

FPGAs?

I couldn't implement the counting and reset logic in an FPGA for this project because I thought the FPGA code might count against the 1kB limit - maybe it didn't; who knows. But, it certainly seems like a small FPGA might do the counting and reset logic easily, and do it faster and more compactly than the discrete logic packages. Combined with a larger, faster SRAM, this might make a nice system. With the extra logic afforded by the FPGA, you might even add a way to write to the SRAM while the VGA output is active - the biggest missing piece in this simple system. Of course, you could also implement a more traditional split-counter and synch-generation system while you were at it.

Composite Video?

Last, and possibly least, would be generation of composite video (NTSC or PAL). There's enough memory on the board I built to store a nice NTSC frame. The clock frequency of 25.175 MHz is more than 7x the color-burst frequency at 3.58 MHz, so you might even be able to generate a full-color signal by directly synthesizing the 3.58 MHz color subcarrier along with the luminance signal. I think the only hardware modification required would be to ditch the three 2-bit DACs and replace them with a single 6-bit DAC. For 6 bits, you can use 1% resistors in an R/2R ladder.

Arbitrary Waveforms?

At its heart, this system is an arbitrary waveform generator, so why not use it as such? I'm guessing you could go to 35 MHz clock frequency with this board, maybe a little more. That would give you a theoretical maximum sinewave output frequency of 17.5 MHz (yes, you'd have to filter it heavily). A more realistic limit might be 10 points per cycle, or 3.5 MHz maximum output. It's not great compared to commercial offerings, but might find some use around the lab.

Next Up

I have one more image I want to get displayed on the monitor with some minimal code, then I'm going to consider this project done. It was a fun distraction, but I have other projects to get back to :-)

Zero Instruction Elapsed Timer
01/06/2017 at 02:04 • 8 comments

Since these images take so long (hours) to generate, I wanted to be able to time them and report exact run-times. But, I didn't want to add any extra instructions to do it, so I decided to use a very simple, low-tech approach.
I found this clock for $3.88 at Walmart. It normally runs off an AA battery. Instead, I'm running it from an unused output line on the PIC:
The resistor and diodes form a crude 1.4 V regulator. The clock wouldn't run with small capacitors - the hand would twitch but not fully advance. The mechanism must need a hefty current pulse to actually advance it.
I was able to set and clear a bit in the code just by changing the constants that get written to the ports, so I didn't increase the code size at all.
To use the timer, you set the clock to 12:00 before a run, then come back and read off the elapsed time after it's done. For very long runs, you just have to check the clock every 12 hours. This would at first appear to violate the Nyquist criterion, since the period of the clock is 12 hours, meaning you have to sample at least every 6 hours, but it doesn't :-)
I probably won't have timing data before the contest deadline, but I'll post it here when I have it.
The ray-tracing took 1 hour and 28 minutes to run:
I'm timing the fractal code now. I think that takes longer...
Technically, I was correct, the fractal code took longer, but not much: 1 hour and 32 minutes. Funny, they're pretty well matched.
I didn't wait around while I was running these things before - I always kicked them off before going to bed, or something like that.
Finally: Ray-tracing on an 8-bit PIC (not 1kB)
01/05/2017 at 17:33 • 0 comments

OK, I'll clog your feed with one more portrait-aspect image (sorry for all of them). I finally got the ray-tracer doing the right thing. It's still 8.3kB of code, which is the best I'm going to be able to do before the contest deadline, but it works. The fractals in 1kB will have to be the code for the contest, but here's ray-tracing on an 8-bit PIC:
The wavy lines are moiré patterns caused by the monitor pixels beating with the camera pixels; they're not in the generated image. I uploaded updated source code with the latest epsilon fix. The code ain't pretty, but "it works."
24-bit Float Fail
01/05/2017 at 13:00 • 18 comments
It turns out that the 24-bit floating point implemented in the XC8 compiler requires a few tweaks of the ray-tracing code. My goal is to re-write this all in 16 (or 24) bit fixed point, anyway, but the code I had ready used floats. Here's the problem:
The noise on the spheres is caused when rays bounce off, then are found to intersect the sphere again immediately. This happens because if the origin of the reflected ray is right on the surface of the sphere, it's ambiguous which side the ray originates on - the dropout points above are where the origin of the reflected ray was found to be inside the sphere, so the ray got trapped in there instead of bouncing off normally. The classic solution to this classic problem is to add a small offset (epsilon) to the reflected ray origin to ensure the reflected ray remains outside - and the required magnitude of this offset depends on the numerical precision used.
I tested this code on my linux box with IEEE 32-bit floating point, where my chosen epsilon was sufficient. Porting the code to the PIC with 24-bit floats looks like it requires a few tweaks. I changed one line to bump epsilon:
```
      // reflect from sphere
      float eps = 0.1;
```
and I have it running again.
CONTEST DISCLAIMER: this code is 8.3kB in size.
Second Try - Slightly Less Fail
My epsilon is still too large - dropouts on the left-hand sphere only now. It's running again...
Can I have 9kB if I enter nine times?
01/05/2017 at 01:35 • 4 comments

I got the ray-tracer to fit - into the PIC, anyway. At 8473 bytes, it's way over the 1kB limit, unfortunately. The good news is that this is without any serious attempts at size optimization - it's using the free XC8 compiler's native 24-bit floating point in pure C, and it was written more for simplicity than compactness. I figured once I got it working, I could start cutting corners to make a version fit.
The bad news is that since this thing is going to take so long to run, I need to start running the code I have if I want a chance at it finishing before the contest deadline - my rough estimates put it at 8 hours, but it could be twice as long or more. So, I kicked it off a few minutes ago.
So, it won't be in 1kB, but I might get a ray-tracer on an 8-bit PIC in true VGA resolution. In some universe, that's worth doing anyway :-)
I'll post an image when it appears. I uploaded the code already - you can see that it could use some serious optimization.
The one issue I had was that the XC8 compiler doesn't support recursive functions, which make ray tracing so much easier. I had to convert to an iterative approach. I've written four (or maybe five) ray tracers of varying complexity for different applications (graphics / optics / solar concentrator design) since the late 1980s, but I think this was the first time I couldn't use recursion. It was a neat little twist.
Final Hardware Design
01/03/2017 at 03:09 • 6 comments

Here's the hardware as-built. Click here for a nicer pdf version. It took a few more ICs than I originally thought, but it's still very simple.
All five waveforms (vsync, hsync, red, green, blue) are stored as a sequence of bytes in a 512k x 8 SRAM; one byte for each pixel clock in the entire frame - blanking intervals and all. Five 74AC163 synchronous binary counters cycle the addresses into the SRAM. The output of the SRAM is latched in a 74ACT574 register - the ACT part is used here since the SRAM has TTL-level outputs. The vsync and hsync signals are output through 61.9 ohm resistors, source-terminating the 75-ohm cables. The color signals are each formed with a simple 3-resistor DAC, providing a total palette of 64 colors, again matched to 75 ohms. Gamma correction is willfully ignored.
I used a 74AC02 quad NOR gate as a pair of MUXes for the clock and reset signals to the counters. When loading data into the SRAM, the PIC bitbangs the clock and reset lines to address sequential locations. After the PIC finishes loading the data, it switches control of the clock and reset lines back to the free-running circuit.
The address reset circuit is fed by a synchronous edge detector (a 74AC74 flip-flop and a 74AC00 NAND gate) that detects the rising edge of the vsync pulse to reset the counter. This arrangement means that the vertical back porch has to be stored first in the SRAM, but this is easily handled by the PIC software.
Finally, since the 74AC logic edges are so fast, and the counters and flip-flops edge sensitive, I took extra care routing the clock signals around the board. The 25.175 MHz dot clock comes from a pre-packaged oscillator "can". I used a 74AC244 octal buffer as a clock-distribution amp, with each of the seven clock lines required on the board driven by a dedicated buffer. To prevent distortions of the clock signals, each line was run with a twisted-pair of wire-wrap wire. These home-brew pairs have an impedance of about 102 ohms, so 86.6 ohm resistors were used to source-terminate each of them at the 74AC244 outputs. The resulting clock signals look good at each of the clocked ICs around the board - this is not the place where you want ringing and possible double-triggering.
It might not have been obvious from other photos of the board, but most of the ICs are SOIC/SOJ and are mounted on some adapters I designed and had made at OSH Park:
These worked really well. The pads are spaced out just enough to make soldering manageable. I've soldered directly to SOIC pins before, and I don't like it. Too small. The boards assume the standard corner power pins for logic ICs and include sites for MLCC bypass caps.
Oh, and as for power consumption - when the PIC is calculating the fractals and loading the SRAM, the circuit draws around 19 mA. Once the VGA generation starts, this jumps to 128 mA. Still not too bad, I guess - you could run it off a USB port if you asked the USB host nicely for more than 100 mA.
EDIT 20170103
I updated the schematic to include the CE_bar line on the SRAM, which just gets tied low.
I have also been thinking about the whole clock distribution/twisted pair thing. It might be avoided by using a 74HC gate as the clock driver - with the slower edge rates, maybe you don't need to worry so much about wire lengths. I discussed why 74AC163's were required a few logs ago, but that doesn't mean everything has to be 74AC. If I build another one, this would be an interesting place to try to simplify even more. Maybe a 74HC02 substituted for the 74AC02 could serve as the MUXes and clock driver.
I found a reference that says 74HC edge rates can be as short as 5ns. This is 1.5m of wire at a velocity factor of 1. 1/6 of this length is 25cm. That might make it doable, assuming the rest of the timing still holds.

Official Contest Entry Log

01/02/2017 at 16:27 • 11 comments

OK. I got pretty fractals onto a VGA monitor using 1019 bytes of code on a PIC16F1718. Unless I get something better done by Thursday, I'm going to consider this my contest entry. I let this one run overnight - when I turned on the monitor this morning, here's what I saw:

The straight C-code takes up 582 14-bit instructions, which is equivalent to 582 * 14 / 8 = 1018.5 bytes.

The code just fit - actually, as originally written, it was one instruction over. I had to inline the SetupPeripherals() call to shave off four instructions. Note that this code is compiled with the free Microchip XC8 compiler. The free version tells me that the code could be 232 words smaller if I used the Pro version - you have to wonder how it knows :-)

Yes, writing the whole thing in C is lazy. Perhaps I can redeem myself in the next few days. I did really want to do a hardware project, though. I'll draw up a schematic for the board as-built today for completeness.

Here's the code. I'll upload it as a file, too.

//
// vga_test.c - create first VGA frame
//
#include <xc.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <pic16f1718.h>

// CONFIG1
#pragma config FOSC = INTOSC
#pragma config WDTE = OFF
#pragma config PWRTE = ON
#pragma config MCLRE = ON
#pragma config CP = OFF
#pragma config BOREN = ON
#pragma config CLKOUTEN = OFF
#pragma config FCMEN = ON

// CONFIG2
#pragma config WRT = ALL
#pragma config PPS1WAY = OFF
#pragma config ZCDDIS = ON
#pragma config PLLEN = ON
#pragma config STVREN = OFF
#pragma config BORV = LO
#pragma config LPBOR = OFF
#pragma config LVP = ON

//
// h/w interface definition
//
// bit mask written to LATA
#define REG_OE_bar 0b00010000
// bit masks written to LATB
#define WE_bar     0b00000001
#define OE_bar     0b00000010
#define CP_en      0b00001000
#define MR_en      0b00100000
#define CP_bar     0b00000100
#define MR_bar     0b00010000

// layout of each byte stored in SRAM: bit 7 = VSYNC, bit 6 = HSYNC,
// bits 5:4 = red, bits 3:2 = green, bits 1:0 = blue
#define VSYNC 0b10000000
#define HSYNC 0b01000000
// pack three colour components into bits 5:0; the components are not
// masked, so each must already be in the range 0..3
#define RGB(r, g, b) (((r) << 4) | ((g) << 2) | (b))

#if 0 // manually inlined below to save 4 instructions
// One-time chip setup: oscillator, digital I/O on ports A/B/C, ports
// cleared and pins set as outputs. This copy is compiled out; it is
// kept as a reference for the inlined version.
void SetupPeripherals() {
  // intosc 32 MHz
  OSCCON = 0b11110000;

  // select digital I/O
  ANSELA = 0;
  ANSELB = 0;
  ANSELC = 0;

  // set TRIS bits: all outputs
  // NOTE(review): TRISC is set to 0x80, which leaves bit 7 of port C as
  // an input - not "all outputs" as stated above; confirm which is meant.
  PORTA = 0x00;
  TRISA = 0x00;
  PORTB = 0x00;
  TRISB = 0x00;
  PORTC = 0x00;
  TRISC = 0x80;
}
#endif

//
// Switch to run mode: stop driving the data lines, reset the address
// counter, then release it so VGA signal generation free-runs.
//
void RunMode()
{
  // all PORTC pins become inputs (data lines no longer driven from here)
  TRISC = 0xff;

  // WE_bar stays high for both writes; OE_bar, CP_en and CP_bar stay low.
  // First write: MR_en and MR_bar high -> address counter reset.
  LATB = WE_bar | MR_en | MR_bar;
  // Second write: MR_en and MR_bar low -> counter released ("let it rip").
  LATB = WE_bar;

  // REG_OE_bar low (the whole of LATA is written as zero)
  LATA = 0;
}

//
// Switch to load mode: prepare the control lines for bit-banging data into
// the SRAM and reset the SRAM address counter to 0.
//
void LoadMode()
{
  // REG_OE_bar high
  LATA = REG_OE_bar;

  // Pulse CP_bar high -> low -> high while all other LATB control bits
  // (MR_bar included) are held high; this resets the address counter.
  LATB = WE_bar | OE_bar | CP_en | MR_en | MR_bar | CP_bar;
  LATB = WE_bar | OE_bar | CP_en | MR_en | MR_bar;
  LATB = WE_bar | OE_bar | CP_en | MR_en | MR_bar | CP_bar;

  // Drop MR_bar to bring the counter out of reset.
  LATB = WE_bar | OE_bar | CP_en | MR_en | CP_bar;

  // all PORTC pins become outputs so data can be driven onto the data lines
  TRISC = 0x00;
}

//
// Bit-bang `count` copies of `value` into consecutive SRAM addresses.
//
//   value - byte placed on PORTC for every write
//   count - number of bytes to write; the loop tests after decrementing,
//           so passing 0 results in 256 writes
//
void write_SRAM_bytes(uint8_t value, uint8_t count)
{
  PORTC = value;

  // idle state: every control bit high except MR_bar
  LATB = WE_bar | OE_bar | CP_en | CP_bar | MR_en;

  do {
    // write strobe: WE_bar low, then high again
    LATB = OE_bar | CP_en | CP_bar | MR_en;
    LATB = WE_bar | OE_bar | CP_en | CP_bar | MR_en;

    // address advance: CP_bar low, then high again
    LATB = WE_bar | OE_bar | CP_en | MR_en;
    LATB = WE_bar | OE_bar | CP_en | CP_bar | MR_en;
  } while (--count != 0);
}

//
// Write `count` identical scan lines (800 bytes each) into the SRAM.
//
//   vsync - VSYNC bit state OR-ed into every byte of the line
//   rgb   - colour bits used for the 640 visible pixels
//   count - number of lines; tested after decrementing, so 0 means 256
//
void GenerateLine(uint8_t vsync, uint8_t rgb, uint8_t count)
{
  do {
    // blanking interval: no colour bits; HSYNC is low only during the pulse
    write_SRAM_bytes(vsync | HSYNC, 16);   // front porch
    write_SRAM_bytes(vsync, 96);           // sync pulse
    write_SRAM_bytes(vsync | HSYNC, 48);   // back porch

    // 640 visible pixels, split into three calls because the byte count
    // parameter is a uint8_t
    write_SRAM_bytes(vsync | HSYNC | rgb, 200);
    write_SRAM_bytes(vsync | HSYNC | rgb, 200);
    write_SRAM_bytes(vsync | HSYNC | rgb, 240);
  } while (--count != 0);
}

// Fixed-point format used by GenerateFrame(): values are int16_t holding
// the real number scaled by 2^S (S = 12 fractional bits).
#define S 12
// Convert a constant to fixed point; the cast truncates any fraction that
// is left after scaling.
#define FP(x) ((int16_t)((x) * (1<<S)))

// Uncomment to select the first parameter set below (WIDTH 2.5) instead of
// the zoomed one (WIDTH 0.125).
//#define WHOLE_SET

// View parameters:
//   IMAG_MIN / REAL_MIN   - starting values of d and c in GenerateFrame()
//   IMAG_STEP / REAL_STEP - per-row / per-column increments
//   ESCAPE_RADIUS         - threshold compared against a*a, b*b and their sum
//   MAXITER               - iteration limit per pixel (fits the uint8_t counter)
#ifdef WHOLE_SET
#define ASPECT   (640./480.)
#define WIDTH     2.5
#define IMAG_MIN  FP(-1.25)
#define IMAG_STEP FP(WIDTH / 480.)
#define REAL_MIN  FP(-2.5)
#define REAL_STEP FP(ASPECT * WIDTH / 640.)
#define ESCAPE_RADIUS FP(4.)
#define MAXITER   255
#else
// With WIDTH = 0.125 both steps come to about 1.07 before the cast, so
// IMAG_STEP and REAL_STEP are each 1 (a single fixed-point LSB).
#define ASPECT   (640./480.)
#define WIDTH     0.125
#define IMAG_MIN  FP(-.0625)
#define REAL_MIN  FP(-1.5)
#define IMAG_STEP FP(WIDTH / 480.)
#define REAL_STEP FP(ASPECT * WIDTH / 640.)
#define ESCAPE_RADIUS FP(4.)
#define MAXITER   255
#endif

//
// Write one complete frame into the SRAM: 33 blank lines, 480 image lines
// whose pixels are coloured by a fixed-point Mandelbrot iteration, then
// 10 blank lines and 2 lines with VSYNC low. Every line is 800 bytes
// (16 + 96 + 48 bytes of blanking/sync followed by 640 pixels).
//
void GenerateFrame()
{
  GenerateLine( VSYNC   , 0,  33);  // V back porch

  // dc/dd: per-column and per-row increments of the point (c, d) being
  // tested; d starts at IMAG_MIN and advances once per row.
  int16_t dc = REAL_STEP;
  int16_t dd = IMAG_STEP;
  int16_t d = IMAG_MIN;  
  int16_t row = 480;
  do {
    write_SRAM_bytes( VSYNC | HSYNC   | 0 , 16);  // H front porch
    write_SRAM_bytes( VSYNC | HSYNC&0 | 0 , 96);  // H sync pulse
    write_SRAM_bytes( VSYNC | HSYNC   | 0 , 48);  // H back porch

    // c restarts at REAL_MIN for every row and advances once per pixel
    int16_t c = REAL_MIN;
    int16_t col = 640;
    do {
      // iterate (a, b) <- (a*a - b*b + c, 2*a*b + d), starting from (0, 0);
      // iter counts down from MAXITER and keeps its value at the break
      int16_t a = 0;
      int16_t b = 0;
      uint8_t iter = MAXITER;
      do {
        // a*a in 32 bits; any of the top five bits set means the product is
        // at least 2^27, so the value shifted right by S (12) would not fit
        // in an int16_t
        int32_t aa32 = ((int32_t)a * (int32_t)a);
        if (aa32 & 0xf8000000){
          break;
        }
        int16_t aa = aa32 >> S;

        // same overflow guard for b*b
        int32_t bb32 = ((int32_t)b * (int32_t)b);
        if (bb32 & 0xf8000000){
          break;
        }
        int16_t bb = bb32 >> S;

        // escape test on a*a + b*b; the individual terms are tested first.
        // NOTE(review): where int is 16 bits, aa + bb could reach 32768 if
        // both terms equal ESCAPE_RADIUS - confirm whether that edge matters.
        if (aa > ESCAPE_RADIUS ||
            bb > ESCAPE_RADIUS ||
            aa + bb > ESCAPE_RADIUS){
          break;
        }
        // b is updated first because it needs the old value of a; shifting
        // by S-1 instead of S supplies the factor of two in 2*a*b
        b = (((int32_t)a * (int32_t)b) >> (S-1)) + d;
        a = aa - bb + c;
      } while(--iter);

      // colour from the low six bits of the remaining iteration count:
      // red = bits 1:0, green = bits 3:2, blue = bits 5:4.
      // iter is 0 when the loop ran to completion, which gives RGB(0, 0, 0).
      uint8_t red, green, blue;
      red = (iter & 3);
      green = ((iter & 0x0c) >> 2);
      blue = ((iter & 0x30) >> 4);
      write_SRAM_bytes( VSYNC | HSYNC   | RGB(red, green, blue), 1); // one pixel

      c += dc;
    } while (--col);
    d += dd;
  } while (--row);

  GenerateLine( VSYNC   , 0,  10);  // V front porch
  GenerateLine( VSYNC&0 , 0,  2);   // V sync pulse
  write_SRAM_bytes( VSYNC | HSYNC | 0, 2);     // end of vsync; resets counter
}

// Program entry: configure the oscillator and I/O ports, fill the SRAM with
// one frame, switch to run mode, then idle forever.
int main() {

  // The body of SetupPeripherals() is expanded in place here to shave off
  // instructions.

  // internal oscillator, 32 MHz
  OSCCON = 0xF0;

  // every pin on ports A, B and C is digital I/O
  ANSELA = 0x00;
  ANSELB = 0x00;
  ANSELC = 0x00;

  // clear each port, then set its direction register:
  // ports A and B get TRIS = 0x00; port C gets 0x80 (bit 7 left set)
  PORTA = 0;
  TRISA = 0;
  PORTB = 0;
  TRISB = 0;
  PORTC = 0;
  TRISC = 0x80;

  LoadMode();       // control lines set up for writing, address counter reset
  GenerateFrame();  // bit-bang the whole frame into the SRAM
  RunMode();        // hand over to free-running signal generation

  // nothing further to do
  for (;;) {
  }

  return 0;
}

Gamma correction? We don't need no stinkin' gamma correction.
01/02/2017 at 03:50 • 0 comments

It actually could use gamma correction, but I'm not going to do it. The problem is that the VGA video intensity response curve is non-linear: a hold-over from CRTs. To compensate, the video signal levels need to be pre-distorted so that the overall system response is linear. Without correction, here's the 64-color palette of the adapter:
It's not bad, but it would look better corrected. Of course, you can't do this kind of non-linear transform with just a resistor network. I had thought about decoding each 2-bit color component into a 1-of-4 with a 74AC138 decoder, then giving each level its own resistor, but didn't want to add that much more hardware.
Next, I considered making a non-linear network with resistors and diodes, but thought that color drift with temperature would be a bizarre side-effect.
In the end, I decided to just leave it.

PIC Graphics Demo

Fractals re-written in assembly: 560 bytes

640x480 RLE Wrencher (4409 Bytes)

Quadtrees?

Other Video Formats (SVGA/Composite)?

SRAM Size

Clock Frequency

FPGAs?

Composite Video?

Arbitrary Waveforms?

Next Up

Zero Instruction Elapsed Timer

Finally: Ray-tracing on an 8-bit PIC (not 1kB)

24-bit Float Fail

Second Try - Slightly Less Fail

Can I have 9kB if I enter nine times?

Final Hardware Design

EDIT 20170103

Official Contest Entry Log

Gamma correction? We don't need no stinkin' gamma correction.