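<br>x<sup>2</sup>/4 for 9-bit values of x. The table occupies four 256-byte pages: first the LSBs of the entries for which bit 8 of x is zero, then the MSBs of those entries, followed by the LSBs and then the MSBs of the entries for which bit 8 of x is one.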
<pre>; gen_sq4 - fill umul_tab with floor(x*x/4) for x = 0..511.
;
; Table layout (four 256-byte pages, indexed by the low 8 bits of x):
;   umul_tab + #000 : low bytes,  x = 0..255
;   umul_tab + #100 : high bytes, x = 0..255
;   umul_tab + #200 : low bytes,  x = 256..511
;   umul_tab + #300 : high bytes, x = 256..511
;
; Each entry is derived from the previous one:
;   floor(x*x/4) = floor((x-1)*(x-1)/4) + floor(x/2)
;
; In:    nothing
; Out:   umul_tab filled
; Clobb: AF, BC, DE, HL
.gen_sq4
	xor a
	ld de,umul_tab + #1ff
	ld (de),a		; seed "previous entry" at index #ff: high byte = 0
	dec d
	ld (de),a		; ... and low byte = 0 (both overwritten once x reaches 255)
	ld h,d
	ld l,e			; HL -> previous entry, low-byte page
	inc e			; E wraps to 0: DE -> entry for x = 0
	ld c,e			; C = 0, so "adc c" below adds only the carry
	ld b,2			; two passes: bit 8 of x clear, then set
.sq4_lp
	ld a,b
	cp 2			; carry = bit 8 of x (set only on the second pass, B = 1)
	ld a,e
	rra			; A = floor(x/2) for the 9-bit x
	add (hl)		; + low byte of previous entry
	ld (de),a
	inc h
	inc d			; move both pointers to their high-byte pages
	ld a,(hl)
	adc c			; high byte of previous entry + carry from the low byte
	ld (de),a
	dec d
	ld h,d			; HL back to the low-byte page being written
	inc l			; the entry just written becomes the previous one
	inc e
	jr nz,sq4_lp		; until the low 8 bits of x wrap
	inc d
	inc d			; DE -> low-byte page for x = 256..511
	djnz sq4_lp
	ret

; The table must start on a 256-byte boundary: the code addresses it by page.
align #100
.umul_tab
	ds #400 </pre>
Now for the actual multiply routine:
'''Output:''' DE = ''Product''
<pre>; 8 x 8 -> 16 bit unsigned multiply using the quarter-square table:
;   A * L = floor((A+L)^2 / 4) - floor((A-L)^2 / 4)
;
; In:    A, L = the two 8-bit factors
; Out:   DE = product
; Clobb: AF, BC, HL
; Needs: umul_tab page-aligned and already filled with floor(x*x/4)
;        (low/high byte pages for x = 0..255, then for x = 256..511).
; Note:  there is no RET; the sequence is meant to be used inline.
; The number in each comment is the running timing count from the
; original listing (presumably CPC NOP cycles - confirm).
	ld h,umul_tab / #100	; 2   H = first table page
	ld b,h			; 3
	add l			; 4   A = low 8 bits of A+L, carry = bit 8
	ld c,a			; 5
	jr nc,@noovf		; 7
	inc b			; 8   sum >= 256: use the pages for x = 256..511
	inc b			; 9
.@noovf
	sub l			; 10
	sub l			; 11  A = A-L (mod 256), carry set if A < L
	jr nc,@noneg		; 13
	neg			; 15  A = |A-L|
.@noneg
	ld l,a			; 16  HL -> low byte of floor((A-L)^2 / 4)
	ld a,(bc)		; 18  low byte of floor((A+L)^2 / 4)
	sub (hl)		; 20
	ld e,a			; 21
	inc b			; 22  step both pointers to the high-byte pages
	inc h			; 23
	ld a,(bc)		; 25
	sbc (hl)		; 27  high byte, including borrow from the low byte
	ld d,a			; 28 </pre>
This code could easily be converted to a macro as it's only 24 bytes. I've tried to optimise it further but with no luck!