Difference between revisions of "Programming:Integer Multiplication"

Latest revision as of 08:13, 6 March 2025

1 Classic 8bit * 8bit Unsigned
2 Classic 16bit * 8bit Unsigned
3 Fast 8bit * 8bit Unsigned (using log / antilog tables)
4 Fastest, accurate 8bit * 8bit Unsigned with 16 KB tables
5 Fast, accurate 8bit * 8bit Unsigned with 8 KB tables
6 Very fast 8bit * 8bit Unsigned with only 1K of tables
7 Fast 8bit * 8bit Unsigned with only 512 bytes of tables
8 Fast 8bit * 8bit Unsigned with only 512 bytes of tables (improved version)
9 16bit * 16bit Unsigned -> 32 bits result
10 16bit * 16bit Unsigned -> 24 bits result
11 Web links

Classic 8bit * 8bit Unsigned

Input: H = Multiplier, E = Multiplicand, L = 0, D = 0

Output: HL = Product

CPC Cycles: 188-244 (216 on average) = 47-61 (54) usec

Size: 33 bytes

The additional clocks and bytes maybe required to set up D and L and for RET.

sla h  ; optimised 1st iteration
jr nc,$+3
ld l,e

add hl,hl  ; unroll 7 times
jr nc,$+3  ; ...
add hl,de  ; ...

Classic 16bit * 8bit Unsigned

Input: A = Multiplier, DE = Multiplicand, HL = 0, C = 0

Output: A:HL = Product

CPC Cycles: 212-300 (256 on average) = 53-75 (64) usec

Size: 47 bytes

add a,a  ; optimised 1st iteration
jr nc,$+4
ld h,d
ld l,e

add hl,hl  ; unroll 7 times
rla   ; ...
jr nc,$+4  ; ...
add hl,de  ; ...
adc a,c  ; ...

Fast 8bit * 8bit Unsigned (using log / antilog tables)

The original routine was written by Jeff Frohwein for the Nintendo Gameboy. You can find it on Devrs.com.

Because of the usage of log / antilog tables this routine is less accurate, but very fast. It takes advantage of the fact that if you take the log of two numbers, add the results and then take the antilog of the total you have done the equivalent of multiplying the two numbers:

 x^a * x^b = x^(a+b)

 a * b = x^(logx(a) + logx(b))

Input: B = Multiplier, C = Multiplicant

Output: DE = Product

CPC Cycles: 124 = 31 usec

Size: 1024 bytes (768 bytes for the tables)

Warning: It is VERY approximate routine, e.g., 0*7=7, 1*28=27, 10*81=790, 100*100=9742, ...


FastMult:
  ld      l,c
  ld      h,&82
  ld      d,(hl)          ; d = 32 * log_2(c)
  
  ld      l,b
  ld      a,(hl)          ; a = 32 * log_2(b)
  
  add     a,d
  ld      l,a
  ld      a,0
  adc     a,0
  ld      h,a             ; hl = d + a
  
  add     hl,hl
  set     2,h             ; hl = hl + $0400
  set     7,h             ; hl = hl + &8000
  
  ld      e,(hl)
  inc     hl
  ld      d,(hl)          ; de = 2^((hl)/32)
  
  ret

; 32*Log_2(x) Table
;
;   FOR A=0 TO 255
;     C=4 
;     B=2
;     FOR Z=1 TO 10
;       IF (2^C) > A THEN C=C-B ELSE C=C+B
;       B=B/2
;     NEXT Z
;     PRINT INT(C*32);",";
;   NEXT A
ORG &8200

logtable:
  db 0 , 0 , 32 , 50 , 64 , 74 , 82 , 89 , 96 , 101 , 106 , 110 , 114 , 118 , 121
  db 125 , 128 , 130 , 133 , 135 , 138 , 140 , 142 , 144 , 146 , 148 , 150 , 152
  db 153 , 155 , 157 , 158 , 160 , 161 , 162 , 164 , 165 , 166 , 167 , 169 , 170
  db 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183
  db 184 , 185 , 185 , 186 , 187 , 188 , 189 , 189 , 190 , 191 , 192 , 192 , 193
  db 194 , 194 , 195 , 196 , 196 , 197 , 198 , 198 , 199 , 199 , 200 , 201 , 201
  db 202 , 202 , 203 , 204 , 204 , 205 , 205 , 206 , 206 , 207 , 207 , 208 , 208
  db 209 , 209 , 210 , 210 , 211 , 211 , 212 , 212 , 213 , 213 , 213 , 214 , 214
  db 215 , 215 , 216 , 216 , 217 , 217 , 217 , 218 , 218 , 219 , 219 , 219 , 220
  db 220 , 221 , 221 , 221 , 222 , 222 , 222 , 223 , 223 , 224 , 224 , 224 , 225
  db 225 , 225 , 226 , 226 , 226 , 227 , 227 , 227 , 228 , 228 , 228 , 229 , 229
  db 229 , 230 , 230 , 230 , 231 , 231 , 231 , 231 , 232 , 232 , 232 , 233 , 233
  db 233 , 234 , 234 , 234 , 234 , 235 , 235 , 235 , 236 , 236 , 236 , 236 , 237
  db 237 , 237 , 237 , 238 , 238 , 238 , 238 , 239 , 239 , 239 , 239 , 240 , 240
  db 240 , 241 , 241 , 241 , 241 , 241 , 242 , 242 , 242 , 242 , 243 , 243 , 243
  db 243 , 244 , 244 , 244 , 244 , 245 , 245 , 245 , 245 , 245 , 246 , 246 , 246
  db 246 , 247 , 247 , 247 , 247 , 247 , 248 , 248 , 248 , 248 , 249 , 249 , 249
  db 249 , 249 , 250 , 250 , 250 , 250 , 250 , 251 , 251 , 251 , 251 , 251 , 252
  db 252 , 252 , 252 , 252 , 253 , 253 , 253 , 253 , 253 , 253 , 254 , 254 , 254
  db 254 , 254 , 255 , 255 , 255 , 255 , 255


; AntiLog 2^(x/32) Table
;
;   FOR A=0 to 510
;   PRINT INT(2^(A/32)+.5);",";
;   NEXT A
ORG &8400

antilog:
  dw 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2
  dw 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2
  dw 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 , 4
  dw 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 6
  dw 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 8 , 8 , 9
  dw 9 , 9 , 9 , 9 , 10 , 10 , 10 , 10 , 10 , 11 , 11 , 11 , 11 , 12 , 12 , 12 , 12
  dw 13 , 13 , 13 , 13 , 14 , 14 , 14 , 15 , 15 , 15 , 16 , 16 , 16 , 17 , 17 , 17
  dw 18 , 18 , 19 , 19 , 19 , 20 , 20 , 21 , 21 , 22 , 22 , 23 , 23 , 24 , 24 , 25
  dw 25 , 26 , 26 , 27 , 27 , 28 , 29 , 29 , 30 , 31 , 31 , 32 , 33 , 33 , 34 , 35
  dw 36 , 36 , 37 , 38 , 39 , 40 , 41 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49
  dw 50 , 52 , 53 , 54 , 55 , 56 , 57 , 59 , 60 , 61 , 63 , 64 , 65 , 67 , 68 , 70
  dw 71 , 73 , 74 , 76 , 78 , 79 , 81 , 83 , 85 , 87 , 89 , 91 , 92 , 95 , 97 , 99
  dw 101 , 103 , 105 , 108 , 110 , 112 , 115 , 117 , 120 , 123 , 125 , 128 , 131
  dw 134 , 137 , 140 , 143 , 146 , 149 , 152 , 156 , 159 , 162 , 166 , 170 , 173
  dw 177 , 181 , 185 , 189 , 193 , 197 , 202 , 206 , 211 , 215 , 220 , 225 , 230
  dw 235 , 240 , 245 , 251 , 256 , 262 , 267 , 273 , 279 , 285 , 292 , 298 , 304
  dw 311 , 318 , 325 , 332 , 339 , 347 , 354 , 362 , 370 , 378 , 386 , 395 , 403
  dw 412 , 421 , 431 , 440 , 450 , 459 , 470 , 480 , 490 , 501 , 512 , 523 , 535
  dw 546 , 558 , 571 , 583 , 596 , 609 , 622 , 636 , 650 , 664 , 679 , 693 , 709
  dw 724 , 740 , 756 , 773 , 790 , 807 , 825 , 843 , 861 , 880 , 899 , 919 , 939
  dw 960 , 981 , 1002 , 1024 , 1046 , 1069 , 1093 , 1117 , 1141 , 1166 , 1192
  dw 1218 , 1244 , 1272 , 1300 , 1328 , 1357 , 1387 , 1417 , 1448 , 1480 , 1512
  dw 1545 , 1579 , 1614 , 1649 , 1685 , 1722 , 1760 , 1798 , 1838 , 1878 , 1919
  dw 1961 , 2004 , 2048 , 2093 , 2139 , 2186 , 2233 , 2282 , 2332 , 2383 , 2435
  dw 2489 , 2543 , 2599 , 2656 , 2714 , 2774 , 2834 , 2896 , 2960 , 3025 , 3091
  dw 3158 , 3228 , 3298 , 3371 , 3444 , 3520 , 3597 , 3676 , 3756 , 3838 , 3922
  dw 4008 , 4096 , 4186 , 4277 , 4371 , 4467 , 4565 , 4664 , 4767 , 4871 , 4978
  dw 5087 , 5198 , 5312 , 5428 , 5547 , 5668 , 5793 , 5919 , 6049 , 6182 , 6317
  dw 6455 , 6597 , 6741 , 6889 , 7039 , 7194 , 7351 , 7512 , 7677 , 7845 , 8016
  dw 8192 , 8371 , 8555 , 8742 , 8933 , 9129 , 9329 , 9533 , 9742 , 9955 , 10173
  dw 10396 , 10624 , 10856 , 11094 , 11337 , 11585 , 11839 , 12098 , 12363 , 12634
  dw 12910 , 13193 , 13482 , 13777 , 14079 , 14387 , 14702 , 15024 , 15353 , 15689
  dw 16033 , 16384 , 16743 , 17109 , 17484 , 17867 , 18258 , 18658 , 19066 , 19484
  dw 19911 , 20347 , 20792 , 21247 , 21713 , 22188 , 22674 , 23170 , 23678 , 24196
  dw 24726 , 25268 , 25821 , 26386 , 26964 , 27554 , 28158 , 28774 , 29404 , 30048
  dw 30706 , 31379 , 32066 , 32768 , 33485 , 34219 , 34968 , 35734 , 36516 , 37316
  dw 38133 , 38968 , 39821 , 40693 , 41584 , 42495 , 43425 , 44376 , 45348 , 46341
  dw 47356 , 48393 , 49452 , 50535 , 51642 , 52772 , 53928 , 55109 , 56316 , 57549
  dw  58809 , 60097 , 61413 , 62757

Fastest, accurate 8bit * 8bit Unsigned with 16 KB tables

Input: L = Multiplier, C = Multiplicand

Output: DE = Product

CPC Cycles: 80 = 20 usec

Size: 14 bytes of code + 16 KB tables

I'm currently working on a new routine, based on a routine by Kirk Meyer [1] which uses nibble multiplication tables.

I have a working, tested version which uses 16K of tables and can perform multiplication in 20μs.

The code to produce the tables is below:

.maketabs
ld hl,hltab1
.makelp1
ld a,l
rla
and #1e
add restab / 256
ld (hl),a
inc l
jr nz,makelp1
inc h
.makelp2
ld a,l
rra:rra:rra
and #1e
jr z,usez
add restab2 - restab / 256 - 2
.usez
add restab / 256
ld (hl),a
inc l
jr nz,makelp2
inc h			; restab
.makelp3
ld a,(hl)
inc h
ld d,(hl)
inc h
add l
ld (hl),a
inc h
ld a,0
adc d
ld (hl),a
dec h:dec h:dec h
inc l
jr nz,makelp3
inc h
inc h
ld a,h
cp restab2 / 256 - 2
jr nz,makelp3
ld b,h
ld c,l
inc b
inc b
ld h,restab / 256 + 2
.makelp4
ld e,(hl)
inc h
ld d,(hl)
dec h
ex de,hl
add hl,hl:add hl,hl:add hl,hl:add hl,hl
ex de,hl
ld a,e
ld (bc),a
inc b
ld a,d
ld (bc),a
dec b
inc l
inc c
jr nz,makelp4
inc h
inc h
inc b
inc b
ld a,h
cp restab2 / 256
jr nz,makelp4
ret

ds -$ and #ff
.hltab1
ds 256
.hltab2
ds 256
.restab
ds 512 * 16
.restab2
ds 512 * 15

The code to perform the multiplication (DE = L * C):

ld h,hltab1 / 256	; 2
ld b,(hl)		; 4
inc h			; 5
ld h,(hl)		; 7
ld l,c			; 8
ld a,(bc)		; 10
add (hl)		; 12
ld e,a			; 13
inc b			; 14
inc h			; 15
ld a,(bc)		; 17
adc (hl)		; 19
ld d,a			; 20

Fast, accurate 8bit * 8bit Unsigned with 8 KB tables

Input: L = Multiplier, C = Multiplicand

Output: DE = Product

CPC Cycles: 100 = 25 usec

Size: 19 bytes of code + 8 KB tables

16K is a lot of memory to use for tables, but I'm working on a way to reduce this to 8K while maintaining similar performance (around 26μs). The idea is rather than using tables for the low and high nibbles, use tables for alternate bits. So there would be 16 tables for the values #00, #01, #04, #05, #10, #11, #14, #15, #40, #41, #44, #45, #50, #51, #54, #55. The values can be shifted by 1 (rather than 4) to give values for the alternate bits using a simple ADD HL,HL.

This routine should be easy to adapt to a 16bit * 8bit unsigned multiplication or even a 16bit * 16bit, using simple additions.

Executioner 03:58, 8 May 2007 (CEST)

Ok, here is the new 8K routine, first the routine to build the tables:

.maketabs
ld hl,hltab1
.makelp5
ld a,l
ld bc,0
rla:rl b:rla:rl c
rla:rl b:rla:rl c
rla:rl b:rla:rl c
rla:rl b:rla:rl c
ld a,b
add a
add restab / 256
ld (hl),a
inc h
ld a,c
add a
add restab / 256
ld (hl),a
dec h
inc l
jr nz,makelp5
ld h,restab / 256 + 2
.makelp6
ld (hl),l
inc l
jr nz,makelp6
inc h:inc h
ld b,2
.makelp8
push bc
xor a
rr b:rra:rrca
rr b:rra:rrca
rr b:rra:rrca
rr b:rra:rrca
.makelp7
push hl
push af
call mulLbyA
ex de,hl
pop af
pop hl
ld (hl),e
inc h
ld (hl),d
dec h
inc l
jr nz,makelp7
inc h:inc h
pop bc
inc b
bit 4,b
jr z,makelp8
ret

.mulLbyA	; MAX times	; MIN times
ld e,l		; 1		; 1
ld d,0		; 3		; 3
add a		; 4		; 4
ld h,a		; 5		; 5
jr c,ncad0	; 8		; 8
ld l,d		
.ncad0
add hl,hl	; 11		; 11
jr nc,ncad1	; 13		; 14
add hl,de	; 16
.ncad1
add hl,hl	; 19		; 17
jr nc,ncad2	; 21		; 20
add hl,de	; 24
.ncad2
add hl,hl	; 27		; 23
jr nc,ncad3	; 29		; 26
add hl,de	; 32
.ncad3
add hl,hl	; 35		; 29
jr nc,ncad4	; 37		; 32
add hl,de	; 40
.ncad4
add hl,hl	; 43		; 35
jr nc,ncad5	; 45		; 38
add hl,de	; 48
.ncad5
add hl,hl	; 51		; 41
jr nc,ncad6	; 53		; 44
add hl,de	; 56
.ncad6
add hl,hl	; 59		; 47
ld a,h		; 60		; 48
ret nc		; 61		; 50?
add hl,de	; 64
ld a,h		; 65
ret

ds -$ and #ff

.hltab1
ds 256
.hltab2
ds 256
.restab
ds 512 * 16

And the routine to do the multiplication:

.umultCL		; Using 8 bit only (25us - DE = L * C)
ld h,hltab2 / 256	; 2
ld b,(hl)		; 4
dec h			; 5
ld h,(hl)		; 7
ld l,c			; 8
ld a,(hl)		; 10
add a			; 11
inc h			; 12
ld d,(hl)		; 14
rl d			; 16
ld h,b			; 17
add (hl)		; 19
ld e,a			; 20
inc h			; 21
ld a,(hl)		; 23
adc d			; 24
ld d,a			; 25
ret

Very fast 8bit * 8bit Unsigned with only 1K of tables

Input: A = Multiplier, L = Multiplicand

Output: DE = Product

CPC Cycles: 104-112 (108 on average) = 26-28 (27) usec

Size: 24 bytes of code and 1024 bytes for the tables

Here's a new routine I've developed which uses the formula ab = ((a + b)² - (a - b)²) / 4. It's based on a routine for the 6502 by Stephen Judd in a C= Hacking article. Because of differences between the way the 6502 does register indexing it was quite difficult to actually get this working, but it's a great compromise between speed and space since it only uses 1K of tables (as opposed to the 16K or 8K table routines above), and can still manage to do the job in a maximum of 28 microseconds.

Firstly, once again, we need some code to generate the tables. These tables contain values for

x²/4 for 9 bit values of x, with the LSB when bit 8 is zero first followed by the MSB.

.gen_sq4
	xor a
	ld de,umul_tab + #1ff
	ld (de),a
	dec d
	ld (de),a
	ld h,d
	ld l,e
	inc e
	ld c,e
	ld b,2

	.sq4_lp
	ld a,b
	cp 2
	ld a,e
	rra
	add (hl)
	ld (de),a
	inc h
	inc d
	ld a,(hl)
	adc c
	ld (de),a
	dec d
	ld h,d
	inc l
	inc e
	jr nz,sq4_lp
	inc d
	inc d
	djnz sq4_lp
	ret

align #100
.umul_tab ds #400

Now for the actual multiply routine:

	ld h,umul_tab_lo / #100	; 2
	ld b,h			; 3
	add l			; 4
	ld c,a			; 5
	jr nc,@noovf		; 7
	inc b			; 8
	inc b			; 9
.@noovf
	sub l			; 10
	sub l			; 11
	jr nc,@noneg		; 13
	neg			; 15
.@noneg
	ld l,a			; 16
	ld a,(bc)		; 18
	sub (hl)		; 20
	ld e,a			; 21
	inc b			; 22
	inc h			; 23
	ld a,(bc)		; 25
	sbc (hl)		; 27
	ld d,a			; 28

This code could easily be converted to a macro as it's only 24 bytes. I've tried to optimize it further but with no luck!

Executioner 06:25, 4 April 2008 (CEST)

Fast 8bit * 8bit Unsigned with only 512 bytes of tables

Input: A = Multiplier, B = Multiplicant

Output: A:E = Product

CPC Cycles: 136-172 (154 on average) = 34-43 (38.5) usec

Size: 44 bytes of code and 512 bytes for the tables

Fast 8 bit unsigned multiplication with 16 bit result. It uses formula
x*y = ((x+y)/2)² - ((x-y)/2)², if x+y is even
= ((x+y-1)/2)² - ((x-y-1)/2)² + y, if x+y is odd and x>=y

    cp b
    jr nc,l1

    ld e,a
    ld a,b
    ld b,e

l1  ld c,a
    sub b
    rra
    ld d,a
    ld a,c
    add a,b
    rra
    ld l,a
    ld h,high(sqrlo)
    ld a,(hl)
    ld e,l
    ld l,d
    jr nc,l2

    sub (hl)   ;odd
    ld l,e
    ld e,a
    inc h      ;loads high(sqrhi)
    ld a,(hl)
    ld l,d
    sbc a,(hl)
    ld d,a

    ld a,e
    add a,b
    ld e,a
    ld a,d
    adc a,0
    ret

l2  sub (hl)   ;even
    ld l,e
    ld e,a
    inc h
    ld a,(hl)
    ld l,d
    sbc a,(hl)
    ret

sqrlo ;low(x*x)   should be at the page border
    db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1
    db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1
    db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1
    db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81
    db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61
    db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41
    db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21
    db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1
    db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1
    db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1
    db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1
    db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81
    db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61
    db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41
    db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21
    db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1
sqrhi ;high(x*x)
    db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
    db 1,1,1,1,1,1,1,2,2,2,2,2,3,3,3,3
    db 4,4,4,4,5,5,5,5,6,6,6,7,7,7,8,8
    db 9,9,9,$a,$a,$a,$b,$b,$c,$c,$d,$d,$e,$e,$f,$f
    db $10,$10,$11,$11,$12,$12,$13,$13,$14,$14,$15,$15,$16,$17,$17,$18
    db $19,$19,$1a,$1a,$1b,$1c,$1c,$1d,$1e,$1e,$1f,$20,$21,$21,$22,$23
    db $24,$24,$25,$26,$27,$27,$28,$29,$2a,$2b,$2b,$2c,$2d,$2e,$2f,$30
    db $31,$31,$32,$33,$34,$35,$36,$37,$38,$39,$3a,$3b,$3c,$3d,$3e,$3f
    db $40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$4a,$4b,$4c,$4d,$4e,$4f
    db $51,$52,$53,$54,$55,$56,$57,$59,$5a,$5b,$5c,$5d,$5f,$60,$61,$62
    db $64,$65,$66,$67,$69,$6a,$6b,$6c,$6e,$6f,$70,$72,$73,$74,$76,$77
    db $79,$7a,$7b,$7d,$7e,$7f,$81,$82,$84,$85,$87,$88,$8a,$8b,$8d,$8e
    db $90,$91,$93,$94,$96,$97,$99,$9a,$9c,$9d,$9f,$a0,$a2,$a4,$a5,$a7
    db $a9,$aa,$ac,$ad,$af,$b1,$b2,$b4,$b6,$b7,$b9,$bb,$bd,$be,$c0,$c2
    db $c4,$c5,$c7,$c9,$cb,$cc,$ce,$d0,$d2,$d4,$d5,$d7,$d9,$db,$dd,$df
    db $e1,$e2,$e4,$e6,$e8,$ea,$ec,$ee,$f0,$f2,$f4,$f6,$f8,$fa,$fc,$fe

Litwr 10:25, 24 October 2015 (CEST)

Fast 8bit * 8bit Unsigned with only 512 bytes of tables (improved version)

Input: A = Multiplier, L = Multiplicant

Output: DE = Product

CPC Cycles: 124 = 31 usec

Size: 27 bytes of code and 512 bytes for the tables

by calc84maniac

Hey, so someone brought up the wiki's 512-byte table routine in the context of Game Boy development, and I thought I'd take a crack at seeing if the speed and size could be made closer to the 1KB table routine. I actually had a good bit of success, by making the following observations:

The x >= y constraint isn't necessary if negative values for floor((x - y) / 2) have their square calculated properly, so no need to swap inputs.
Going from floor((x + y) / 2) to floor((x - y) / 2) simply requires a subtraction of y, so no need to divide by 2 twice or save the input value of x.
To square a negative number z, it's sufficient to calculate z ^ 2 = (z + 256) ^ 2 - z * 512 - 65536, meaning a conditional offset of -(z * 512) in the subtracted square, or z * 512 in the final result.
When conditionally adding y to the result when x + y is odd, the carry can be propagated into the low bit of the 8-bit register holding the upper half of z * 512.
Both of these conditional adds can be turned into branchless adds of either the value or zero.

This results in the following implementation (which I adjusted to take the same inputs and outputs as the 1KB routine for a fair comparison), which comes out to 27 bytes and a constant 31 usec:

; Input: A = x, L = y
; Output: DE = x * y
    ; C = floor((x + y) / 2)
    add a,l          ; 1
    rra              ; 2
    ld c,a           ; 3
    ; E = y if x + y is odd, else 0
    sbc a,a          ; 4
    and l            ; 5
    ld e,a           ; 6
    ; L = floor((x - y) / 2)
    ld a,c           ; 7
    sub l            ; 8
    ld l,a           ; 9
    ; D = L if negative, else 0
    sbc a,a          ; 10
    and l            ; 11
    ld d,a           ; 12
    ; Calculate low half of product
    ld h,high(sqrlo) ; 14
    ld b,h           ; 15
    ld a,(bc)        ; 17
    add a,e          ; 18
    rl d             ; 20, multiplies D by 2 and propagates carry
    sub (hl)         ; 22
    ld e,a           ; 23
    ; Calculate high half of product
    inc h            ; 24, loads high(sqrhi)
    ld b,h           ; 25
    ld a,(bc)        ; 27
    sbc a,(hl)       ; 29
    add a,d          ; 30
    ld d,a           ; 31

16bit * 16bit Unsigned -> 32 bits result

Input: BC = Multiplier, DE = Multiplicand

Output: DE,HL = Product

ld hl,0
ld a,16
muluw
 add hl,hl
 rl e
 rl d
 jr nc,muluw_cont
  add hl,bc
  jr nc,muluw_cont
   inc de
muluw_cont
 dec a
 jr nz,muluw

16bit * 16bit Unsigned -> 24 bits result

Input: BC = Multiplier, DE = Multiplicand

Output: A,HL = Product

ld ix,0
 ld hl,0
.mul24b1:
 ld a,c
 or b
 jr z,mul24b3
 srl b
 rr c
 jr nc,mul24b2
 add ix,de
 ld a,h
 adc l
 ld h,a
.mul24b2:
 sla e
 rl d
 rl l
 jr mul24b1:
.mul24b3:
 ld a,h
 db #dd:ld e,l
 db #dd:ld d,h
 ex de,hl
 ret

Web links

Multiplications and divisions on the MSX Assembly Page
"Multiplication on a Z80" ( MSX Computer & Club Webmagazine)
The Z80 number cruncher (Andre Adrian) (16/32bit multiplication, addition, right shift)

Difference between revisions of "Programming:Integer Multiplication"

Latest revision as of 08:13, 6 March 2025

Contents

Classic 8bit * 8bit Unsigned

Classic 16bit * 8bit Unsigned

Fast 8bit * 8bit Unsigned (using log / antilog tables)

Fastest, accurate 8bit * 8bit Unsigned with 16 KB tables

Fast, accurate 8bit * 8bit Unsigned with 8 KB tables

Very fast 8bit * 8bit Unsigned with only 1K of tables

Fast 8bit * 8bit Unsigned with only 512 bytes of tables

Fast 8bit * 8bit Unsigned with only 512 bytes of tables (improved version)

16bit * 16bit Unsigned -> 32 bits result

16bit * 16bit Unsigned -> 24 bits result

Web links

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Contents

Hardware

Software

Printed matter

The Scene

Tools

@@ Line 175: / Line 175: @@
 </pre>
-== Faster, accurate 8bit * 8bit Unsigned ==
+== Fastest, accurate 8bit * 8bit Unsigned  with 16 KB tables ==
 '''Input:''' L = Multiplier, C = Multiplicand
@@ Line 294: / Line 294: @@
 </pre>
-K is a lot of memory to use for tables, but I'm working on a way to reduce this to 8K while maintaining similar performance (around 26&mu;s). The idea is rather than using tables for the low and high nibbles, use tables for alternate bits. So there would be 16 tables for the values #00, #01, #04, #05, #10, #11, #14, #15, #40, #41, #44, #45, #50, #51, #54, #55. The values can be shifted by 1 (rather than 4) to give values for the alternate bits using a simple ADD HL,HL.
+== Fast, accurate 8bit * 8bit Unsigned  with 8 KB tables ==
-This routine should be easy to adapt to a 16bit * 8bit unsigned multiplication or even a 16bit * 16bit, using simple additions.
+'''Input:''' L = Multiplier, C = Multiplicand
-[[User:Executioner|Executioner]] 03:58, 8 May 2007 (CEST)
+'''Output:''' DE = Product
 '''CPC Cycles:''' 100 = 25 usec
 '''Size:''' 19 bytes of code + 8 KB tables
+K is a lot of memory to use for tables, but I'm working on a way to reduce this to 8K while maintaining similar performance (around 26&mu;s). The idea is rather than using tables for the low and high nibbles, use tables for alternate bits. So there would be 16 tables for the values #00, #01, #04, #05, #10, #11, #14, #15, #40, #41, #44, #45, #50, #51, #54, #55. The values can be shifted by 1 (rather than 4) to give values for the alternate bits using a simple ADD HL,HL.
+This routine should be easy to adapt to a 16bit * 8bit unsigned multiplication or even a 16bit * 16bit, using simple additions.
+[[User:Executioner|Executioner]] 03:58, 8 May 2007 (CEST)
 Ok, here is the new 8K routine, first the routine to build the tables:
@@ Line 437: / Line 443: @@
 == Very fast 8bit * 8bit Unsigned with only 1K of tables ==
-'''Input''': B = Multiplier, C = Multiplicant
+'''Input:''' A = ''Multiplier'', L = ''Multiplicand''
-'''Output''': DE = Product
+'''Output:''' DE = ''Product''
-'''Clocks''': 104-112 (108 on average)
+'''CPC Cycles''': 104-112 (108 on average) = 26-28 (27) usec
 '''Size''': 24 bytes of code and 1024 bytes for the tables
@@ Line 490: / Line 496: @@
 Now for the actual multiply routine:
-'''Input:''' A = ''Multiplier'', L = ''Multiplicand''
-'''Output:''' DE = ''Product''
 <pre>
 	ld h,umul_tab_lo / #100	; 2
@@ Line 522: / Line 525: @@
 [[User:Executioner|Executioner]] 06:25, 4 April 2008 (CEST)
-== 16bit * 16bit Unsigned ==
+== Fast 8bit * 8bit Unsigned with only 512 bytes of tables ==
+'''Input''': A = Multiplier, B = Multiplicant
+'''Output''': A:E = Product
+'''CPC Cycles''': 136-172 (154 on average) = 34-43 (38.5) usec
+'''Size''': 44 bytes of code and 512 bytes for the tables
+Fast 8 bit unsigned multiplication with 16 bit result. It uses formula
+<br>x*y = ((x+y)/2)<sup>2</sup> - ((x-y)/2)<sup>2</sup>, if x+y is even
+<br>      = ((x+y-1)/2)<sup>2</sup> - ((x-y-1)/2)<sup>2</sup> + y, if x+y is odd and x>=y
+<pre>    cp b
+    jr nc,l1
+    ld e,a
+    ld a,b
+    ld b,e
+l1  ld c,a
+    sub b
+    rra
+    ld d,a
+    ld a,c
+    add a,b
+    rra
+    ld l,a
+    ld h,high(sqrlo)
+    ld a,(hl)
+    ld e,l
+    ld l,d
+    jr nc,l2
+    sub (hl)   ;odd
+    ld l,e
+    ld e,a
+    inc h      ;loads high(sqrhi)
+    ld a,(hl)
+    ld l,d
+    sbc a,(hl)
+    ld d,a
+    ld a,e
+    add a,b
+    ld e,a
+    ld a,d
+    adc a,0
+    ret
+l2  sub (hl)   ;even
+    ld l,e
+    ld e,a
+    inc h
+    ld a,(hl)
+    ld l,d
+    sbc a,(hl)
+    ret
+sqrlo ;low(x*x)   should be at the page border
+    db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1
+    db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1
+    db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1
+    db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81
+    db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61
+    db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41
+    db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21
+    db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1
+    db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1
+    db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1
+    db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1
+    db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81
+    db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61
+    db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41
+    db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21
+    db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1
+sqrhi ;high(x*x)
+    db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+    db 1,1,1,1,1,1,1,2,2,2,2,2,3,3,3,3
+    db 4,4,4,4,5,5,5,5,6,6,6,7,7,7,8,8
+    db 9,9,9,$a,$a,$a,$b,$b,$c,$c,$d,$d,$e,$e,$f,$f
+    db $10,$10,$11,$11,$12,$12,$13,$13,$14,$14,$15,$15,$16,$17,$17,$18
+    db $19,$19,$1a,$1a,$1b,$1c,$1c,$1d,$1e,$1e,$1f,$20,$21,$21,$22,$23
+    db $24,$24,$25,$26,$27,$27,$28,$29,$2a,$2b,$2b,$2c,$2d,$2e,$2f,$30
+    db $31,$31,$32,$33,$34,$35,$36,$37,$38,$39,$3a,$3b,$3c,$3d,$3e,$3f
+    db $40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$4a,$4b,$4c,$4d,$4e,$4f
+    db $51,$52,$53,$54,$55,$56,$57,$59,$5a,$5b,$5c,$5d,$5f,$60,$61,$62
+    db $64,$65,$66,$67,$69,$6a,$6b,$6c,$6e,$6f,$70,$72,$73,$74,$76,$77
+    db $79,$7a,$7b,$7d,$7e,$7f,$81,$82,$84,$85,$87,$88,$8a,$8b,$8d,$8e
+    db $90,$91,$93,$94,$96,$97,$99,$9a,$9c,$9d,$9f,$a0,$a2,$a4,$a5,$a7
+    db $a9,$aa,$ac,$ad,$af,$b1,$b2,$b4,$b6,$b7,$b9,$bb,$bd,$be,$c0,$c2
+    db $c4,$c5,$c7,$c9,$cb,$cc,$ce,$d0,$d2,$d4,$d5,$d7,$d9,$db,$dd,$df
+    db $e1,$e2,$e4,$e6,$e8,$ea,$ec,$ee,$f0,$f2,$f4,$f6,$f8,$fa,$fc,$fe
+</pre>
+[[User:Litwr|Litwr]] 10:25, 24 October 2015 (CEST)
+== Fast 8bit * 8bit Unsigned with only 512 bytes of tables (improved version) ==
+'''Input''': A = Multiplier, L = Multiplicant
+'''Output:''' DE = ''Product''
+'''CPC Cycles''': 124 = 31 usec
+'''Size''': 27 bytes of code and 512 bytes for the tables
+'''by calc84maniac'''
+Hey, so someone brought up the wiki's 512-byte table routine in the context of Game Boy development, and I thought I'd take a crack at seeing if the speed and size could be made closer to the 1KB table routine. I actually had a good bit of success, by making the following observations:<br>
+* The x >= y constraint isn't necessary if negative values for floor((x - y) / 2) have their square calculated properly, so no need to swap inputs.<br>
+* Going from floor((x + y) / 2) to floor((x - y) / 2) simply requires a subtraction of y, so no need to divide by 2 twice or save the input value of x.<br>
+* To square a negative number z, it's sufficient to calculate z ^ 2 = (z + 256) ^ 2 - z * 512 - 65536, meaning a conditional offset of -(z * 512) in the subtracted square, or z * 512 in the final result.<br>
+* When conditionally adding y to the result when x + y is odd, the carry can be propagated into the low bit of the 8-bit register holding the upper half of z * 512.<br>
+* Both of these conditional adds can be turned into branchless adds of either the value or zero.<br>
+<br>
+This results in the following implementation (which I adjusted to take the same inputs and outputs as the 1KB routine for a fair comparison), which comes out to 27 bytes and a constant 31 usec:<br>
+<pre>
+; Input: A = x, L = y
+; Output: DE = x * y
+    ; C = floor((x + y) / 2)
+    add a,l          ; 1
+    rra              ; 2
+    ld c,a           ; 3
+    ; E = y if x + y is odd, else 0
+    sbc a,a          ; 4
+    and l            ; 5
+    ld e,a           ; 6
+    ; L = floor((x - y) / 2)
+    ld a,c           ; 7
+    sub l            ; 8
+    ld l,a           ; 9
+    ; D = L if negative, else 0
+    sbc a,a          ; 10
+    and l            ; 11
+    ld d,a           ; 12
+    ; Calculate low half of product
+    ld h,high(sqrlo) ; 14
+    ld b,h           ; 15
+    ld a,(bc)        ; 17
+    add a,e          ; 18
+    rl d             ; 20, multiplies D by 2 and propagates carry
+    sub (hl)         ; 22
+    ld e,a           ; 23
+    ; Calculate high half of product
+    inc h            ; 24, loads high(sqrhi)
+    ld b,h           ; 25
+    ld a,(bc)        ; 27
+    sbc a,(hl)       ; 29
+    add a,d          ; 30
+    ld d,a           ; 31
+</pre>
+== 16bit * 16bit Unsigned -> 32 bits result ==
+'''Input:''' BC = ''Multiplier'', DE = ''Multiplicand''
+'''Output:''' DE,HL = ''Product''
+<pre>ld hl,0
+ld a,16
+muluw
+ add hl,hl
+ rl e
+ rl d
+ jr nc,muluw_cont
+  add hl,bc
+  jr nc,muluw_cont
+   inc de
+muluw_cont
+ dec a
+ jr nz,muluw
+</pre>
+== 16bit * 16bit Unsigned -> 24 bits result ==
 '''Input:''' BC = ''Multiplier'', DE = ''Multiplicand''