Difference between revisions of "Programming:Integer Multiplication"
m (→Faster, accurate 8bit * 8bit Unsigned) |
m (→16bit * 16bit Unsigned -> 32 bits result: forgotten label) |
||
(9 intermediate revisions by 2 users not shown) | |||
Line 175: | Line 175: | ||
</pre> | </pre> | ||
− | == | + | == Fastest, accurate 8bit * 8bit Unsigned with 16 KB tables == |
'''Input:''' L = Multiplier, C = Multiplicand | '''Input:''' L = Multiplier, C = Multiplicand | ||
Line 294: | Line 294: | ||
</pre> | </pre> | ||
− | + | == Fast, accurate 8bit * 8bit Unsigned with 8 KB tables == | |
− | + | '''Input:''' L = Multiplier, C = Multiplicand | |
− | + | '''Output:''' DE = Product | |
'''CPC Cycles:''' 100 = 25 usec | '''CPC Cycles:''' 100 = 25 usec | ||
'''Size:''' 19 bytes of code + 8 KB tables | '''Size:''' 19 bytes of code + 8 KB tables | ||
+ | |||
+ | 16K is a lot of memory to use for tables, but I'm working on a way to reduce this to 8K while maintaining similar performance (around 26μs). The idea is rather than using tables for the low and high nibbles, use tables for alternate bits. So there would be 16 tables for the values #00, #01, #04, #05, #10, #11, #14, #15, #40, #41, #44, #45, #50, #51, #54, #55. The values can be shifted by 1 (rather than 4) to give values for the alternate bits using a simple ADD HL,HL. | ||
+ | |||
+ | This routine should be easy to adapt to a 16bit * 8bit unsigned multiplication or even a 16bit * 16bit, using simple additions. | ||
+ | |||
+ | [[User:Executioner|Executioner]] 03:58, 8 May 2007 (CEST) | ||
Ok, here is the new 8K routine, first the routine to build the tables: | Ok, here is the new 8K routine, first the routine to build the tables: | ||
Line 437: | Line 443: | ||
== Very fast 8bit * 8bit Unsigned with only 1K of tables == | == Very fast 8bit * 8bit Unsigned with only 1K of tables == | ||
− | '''Input''' | + | '''Input:''' A = ''Multiplier'', L = ''Multiplicand'' |
− | '''Output''' | + | '''Output:''' DE = ''Product'' |
− | ''' | + | '''CPC Cycles''': 104-112 (108 on average) = 26-28 (27) usec |
'''Size''': 24 bytes of code and 1024 bytes for the tables | '''Size''': 24 bytes of code and 1024 bytes for the tables | ||
Line 490: | Line 496: | ||
Now for the actual multiply routine: | Now for the actual multiply routine: | ||
− | |||
− | |||
− | |||
<pre> | <pre> | ||
ld h,umul_tab_lo / #100 ; 2 | ld h,umul_tab_lo / #100 ; 2 | ||
Line 522: | Line 525: | ||
[[User:Executioner|Executioner]] 06:25, 4 April 2008 (CEST) | [[User:Executioner|Executioner]] 06:25, 4 April 2008 (CEST) | ||
− | == 16bit * 16bit Unsigned == | + | == Fast 8bit * 8bit Unsigned with only 512 bytes of tables == |
+ | |||
+ | '''Input''': A = Multiplier, B = Multiplicant | ||
+ | |||
+ | '''Output''': A:E = Product | ||
+ | |||
+ | '''CPC Cycles''': 136-172 (154 on average) = 34-43 (38.5) usec | ||
+ | |||
+ | '''Size''': 44 bytes of code and 512 bytes for the tables | ||
+ | |||
+ | Fast 8 bit unsigned multiplication with 16 bit result. It uses formula | ||
+ | <br>x*y = ((x+y)/2)<sup>2</sup> - ((x-y)/2)<sup>2</sup>, if x+y is even | ||
+ | <br> = ((x+y-1)/2)<sup>2</sup> - ((x-y-1)/2)<sup>2</sup> + y, if x+y is odd and x>=y | ||
+ | |||
+ | <pre> cp b | ||
+ | jr nc,l1 | ||
+ | |||
+ | ld e,a | ||
+ | ld a,b | ||
+ | ld b,e | ||
+ | |||
+ | l1 ld c,a | ||
+ | sub b | ||
+ | rra | ||
+ | ld d,a | ||
+ | ld a,c | ||
+ | add a,b | ||
+ | rra | ||
+ | ld l,a | ||
+ | ld h,high(sqrlo) | ||
+ | ld a,(hl) | ||
+ | ld e,l | ||
+ | ld l,d | ||
+ | jr nc,l2 | ||
+ | |||
+ | sub (hl) ;odd | ||
+ | ld l,e | ||
+ | ld e,a | ||
+ | inc h ;loads high(sqrhi) | ||
+ | ld a,(hl) | ||
+ | ld l,d | ||
+ | sbc a,(hl) | ||
+ | ld d,a | ||
+ | |||
+ | ld a,e | ||
+ | add a,b | ||
+ | ld e,a | ||
+ | ld a,d | ||
+ | adc a,0 | ||
+ | ret | ||
+ | |||
+ | l2 sub (hl) ;even | ||
+ | ld l,e | ||
+ | ld e,a | ||
+ | inc h | ||
+ | ld a,(hl) | ||
+ | ld l,d | ||
+ | sbc a,(hl) | ||
+ | ret | ||
+ | |||
+ | sqrlo ;low(x*x) should be at the page border | ||
+ | db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1 | ||
+ | db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1 | ||
+ | db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1 | ||
+ | db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81 | ||
+ | db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61 | ||
+ | db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41 | ||
+ | db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21 | ||
+ | db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1 | ||
+ | db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1 | ||
+ | db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1 | ||
+ | db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1 | ||
+ | db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81 | ||
+ | db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61 | ||
+ | db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41 | ||
+ | db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21 | ||
+ | db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1 | ||
+ | sqrhi ;high(x*x) | ||
+ | db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | ||
+ | db 1,1,1,1,1,1,1,2,2,2,2,2,3,3,3,3 | ||
+ | db 4,4,4,4,5,5,5,5,6,6,6,7,7,7,8,8 | ||
+ | db 9,9,9,$a,$a,$a,$b,$b,$c,$c,$d,$d,$e,$e,$f,$f | ||
+ | db $10,$10,$11,$11,$12,$12,$13,$13,$14,$14,$15,$15,$16,$17,$17,$18 | ||
+ | db $19,$19,$1a,$1a,$1b,$1c,$1c,$1d,$1e,$1e,$1f,$20,$21,$21,$22,$23 | ||
+ | db $24,$24,$25,$26,$27,$27,$28,$29,$2a,$2b,$2b,$2c,$2d,$2e,$2f,$30 | ||
+ | db $31,$31,$32,$33,$34,$35,$36,$37,$38,$39,$3a,$3b,$3c,$3d,$3e,$3f | ||
+ | db $40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$4a,$4b,$4c,$4d,$4e,$4f | ||
+ | db $51,$52,$53,$54,$55,$56,$57,$59,$5a,$5b,$5c,$5d,$5f,$60,$61,$62 | ||
+ | db $64,$65,$66,$67,$69,$6a,$6b,$6c,$6e,$6f,$70,$72,$73,$74,$76,$77 | ||
+ | db $79,$7a,$7b,$7d,$7e,$7f,$81,$82,$84,$85,$87,$88,$8a,$8b,$8d,$8e | ||
+ | db $90,$91,$93,$94,$96,$97,$99,$9a,$9c,$9d,$9f,$a0,$a2,$a4,$a5,$a7 | ||
+ | db $a9,$aa,$ac,$ad,$af,$b1,$b2,$b4,$b6,$b7,$b9,$bb,$bd,$be,$c0,$c2 | ||
+ | db $c4,$c5,$c7,$c9,$cb,$cc,$ce,$d0,$d2,$d4,$d5,$d7,$d9,$db,$dd,$df | ||
+ | db $e1,$e2,$e4,$e6,$e8,$ea,$ec,$ee,$f0,$f2,$f4,$f6,$f8,$fa,$fc,$fe | ||
+ | </pre> | ||
+ | |||
+ | [[User:Litwr|Litwr]] 10:25, 24 October 2015 (CEST) | ||
+ | |||
+ | == 16bit * 16bit Unsigned -> 32 bits result == | ||
+ | |||
+ | '''Input:''' BC = ''Multiplier'', DE = ''Multiplicand'' | ||
+ | |||
+ | '''Output:''' DE,HL = ''Product'' | ||
+ | <pre>ld hl,0 | ||
+ | ld a,16 | ||
+ | muluw | ||
+ | add hl,hl | ||
+ | rl e | ||
+ | rl d | ||
+ | jr nc,muluw_cont | ||
+ | add hl,bc | ||
+ | jr nc,muluw_cont | ||
+ | inc de | ||
+ | muluw_cont | ||
+ | dec a | ||
+ | jr nz,muluw | ||
+ | </pre> | ||
+ | |||
+ | == 16bit * 16bit Unsigned -> 24 bits result == | ||
'''Input:''' BC = ''Multiplier'', DE = ''Multiplicand'' | '''Input:''' BC = ''Multiplier'', DE = ''Multiplicand'' |
Latest revision as of 13:31, 1 March 2018
Contents
- 1 Classic 8bit * 8bit Unsigned
- 2 Classic 16bit * 8bit Unsigned
- 3 Fast 8bit * 8bit Unsigned (using log / antilog tables)
- 4 Fastest, accurate 8bit * 8bit Unsigned with 16 KB tables
- 5 Fast, accurate 8bit * 8bit Unsigned with 8 KB tables
- 6 Very fast 8bit * 8bit Unsigned with only 1K of tables
- 7 Fast 8bit * 8bit Unsigned with only 512 bytes of tables
- 8 16bit * 16bit Unsigned -> 32 bits result
- 9 16bit * 16bit Unsigned -> 24 bits result
- 10 Web links
Classic 8bit * 8bit Unsigned
Input: H = Multiplier, E = Multiplicand, L = 0, D = 0
Output: HL = Product
CPC Cycles: 188-244 (216 on average) = 47-61 (54) usec
Size: 33 bytes
The additional clocks and bytes maybe required to set up D and L and for RET.
sla h ; optimised 1st iteration jr nc,$+3 ld l,e add hl,hl ; unroll 7 times jr nc,$+3 ; ... add hl,de ; ...
Classic 16bit * 8bit Unsigned
Input: A = Multiplier, DE = Multiplicand, HL = 0, C = 0
Output: A:HL = Product
CPC Cycles: 212-300 (256 on average) = 53-75 (64) usec
Size: 47 bytes
add a,a ; optimised 1st iteration jr nc,$+4 ld h,d ld l,e add hl,hl ; unroll 7 times rla ; ... jr nc,$+4 ; ... add hl,de ; ... adc a,c ; ...
Fast 8bit * 8bit Unsigned (using log / antilog tables)
The original routine was written by Jeff Frohwein for the Nintendo Gameboy. You can find it on Devrs.com.
Because of the usage of log / antilog tables this routine is less accurate, but very fast. It takes advantage of the fact that if you take the log of two numbers, add the results and then take the antilog of the total you have done the equivalent of multiplying the two numbers:
x^a * x^b = x^(a+b) a * b = x^(logx(a) + logx(b))
Input: B = Multiplier, C = Multiplicant
Output: DE = Product
CPC Cycles: 124 = 31 usec
Size: 1024 bytes (768 bytes for the tables)
Warning: It is VERY approximate routine, e.g., 0*7=7, 1*28=27, 10*81=790, 100*100=9742, ...
FastMult: ld l,c ld h,&82 ld d,(hl) ; d = 32 * log_2(c) ld l,b ld a,(hl) ; a = 32 * log_2(b) add a,d ld l,a ld a,0 adc a,0 ld h,a ; hl = d + a add hl,hl set 2,h ; hl = hl + $0400 set 7,h ; hl = hl + &8000 ld e,(hl) inc hl ld d,(hl) ; de = 2^((hl)/32) ret ; 32*Log_2(x) Table ; ; FOR A=0 TO 255 ; C=4 ; B=2 ; FOR Z=1 TO 10 ; IF (2^C) > A THEN C=C-B ELSE C=C+B ; B=B/2 ; NEXT Z ; PRINT INT(C*32);","; ; NEXT A ORG &8200 logtable: db 0 , 0 , 32 , 50 , 64 , 74 , 82 , 89 , 96 , 101 , 106 , 110 , 114 , 118 , 121 db 125 , 128 , 130 , 133 , 135 , 138 , 140 , 142 , 144 , 146 , 148 , 150 , 152 db 153 , 155 , 157 , 158 , 160 , 161 , 162 , 164 , 165 , 166 , 167 , 169 , 170 db 171 , 172 , 173 , 174 , 175 , 176 , 177 , 178 , 179 , 180 , 181 , 182 , 183 db 184 , 185 , 185 , 186 , 187 , 188 , 189 , 189 , 190 , 191 , 192 , 192 , 193 db 194 , 194 , 195 , 196 , 196 , 197 , 198 , 198 , 199 , 199 , 200 , 201 , 201 db 202 , 202 , 203 , 204 , 204 , 205 , 205 , 206 , 206 , 207 , 207 , 208 , 208 db 209 , 209 , 210 , 210 , 211 , 211 , 212 , 212 , 213 , 213 , 213 , 214 , 214 db 215 , 215 , 216 , 216 , 217 , 217 , 217 , 218 , 218 , 219 , 219 , 219 , 220 db 220 , 221 , 221 , 221 , 222 , 222 , 222 , 223 , 223 , 224 , 224 , 224 , 225 db 225 , 225 , 226 , 226 , 226 , 227 , 227 , 227 , 228 , 228 , 228 , 229 , 229 db 229 , 230 , 230 , 230 , 231 , 231 , 231 , 231 , 232 , 232 , 232 , 233 , 233 db 233 , 234 , 234 , 234 , 234 , 235 , 235 , 235 , 236 , 236 , 236 , 236 , 237 db 237 , 237 , 237 , 238 , 238 , 238 , 238 , 239 , 239 , 239 , 239 , 240 , 240 db 240 , 241 , 241 , 241 , 241 , 241 , 242 , 242 , 242 , 242 , 243 , 243 , 243 db 243 , 244 , 244 , 244 , 244 , 245 , 245 , 245 , 245 , 245 , 246 , 246 , 246 db 246 , 247 , 247 , 247 , 247 , 247 , 248 , 248 , 248 , 248 , 249 , 249 , 249 db 249 , 249 , 250 , 250 , 250 , 250 , 250 , 251 , 251 , 251 , 251 , 251 , 252 db 252 , 252 , 252 , 252 , 253 , 253 , 253 , 253 , 253 , 253 , 254 , 254 , 254 db 254 , 254 , 255 , 255 , 255 , 255 , 255 ; AntiLog 2^(x/32) Table ; ; FOR A=0 to 510 ; PRINT INT(2^(A/32)+.5);","; ; NEXT A ORG &8400 antilog: dw 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 2 dw 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 dw 2 , 2 , 2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 , 4 dw 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 6 dw 6 , 6 , 6 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 8 , 8 , 8 , 8 , 8 , 9 dw 9 , 9 , 9 , 9 , 10 , 10 , 10 , 10 , 10 , 11 , 11 , 11 , 11 , 12 , 12 , 12 , 12 dw 13 , 13 , 13 , 13 , 14 , 14 , 14 , 15 , 15 , 15 , 16 , 16 , 16 , 17 , 17 , 17 dw 18 , 18 , 19 , 19 , 19 , 20 , 20 , 21 , 21 , 22 , 22 , 23 , 23 , 24 , 24 , 25 dw 25 , 26 , 26 , 27 , 27 , 28 , 29 , 29 , 30 , 31 , 31 , 32 , 33 , 33 , 34 , 35 dw 36 , 36 , 37 , 38 , 39 , 40 , 41 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 dw 50 , 52 , 53 , 54 , 55 , 56 , 57 , 59 , 60 , 61 , 63 , 64 , 65 , 67 , 68 , 70 dw 71 , 73 , 74 , 76 , 78 , 79 , 81 , 83 , 85 , 87 , 89 , 91 , 92 , 95 , 97 , 99 dw 101 , 103 , 105 , 108 , 110 , 112 , 115 , 117 , 120 , 123 , 125 , 128 , 131 dw 134 , 137 , 140 , 143 , 146 , 149 , 152 , 156 , 159 , 162 , 166 , 170 , 173 dw 177 , 181 , 185 , 189 , 193 , 197 , 202 , 206 , 211 , 215 , 220 , 225 , 230 dw 235 , 240 , 245 , 251 , 256 , 262 , 267 , 273 , 279 , 285 , 292 , 298 , 304 dw 311 , 318 , 325 , 332 , 339 , 347 , 354 , 362 , 370 , 378 , 386 , 395 , 403 dw 412 , 421 , 431 , 440 , 450 , 459 , 470 , 480 , 490 , 501 , 512 , 523 , 535 dw 546 , 558 , 571 , 583 , 596 , 609 , 622 , 636 , 650 , 664 , 679 , 693 , 709 dw 724 , 740 , 756 , 773 , 790 , 807 , 825 , 843 , 861 , 880 , 899 , 919 , 939 dw 960 , 981 , 1002 , 1024 , 1046 , 1069 , 1093 , 1117 , 1141 , 1166 , 1192 dw 1218 , 1244 , 1272 , 1300 , 1328 , 1357 , 1387 , 1417 , 1448 , 1480 , 1512 dw 1545 , 1579 , 1614 , 1649 , 1685 , 1722 , 1760 , 1798 , 1838 , 1878 , 1919 dw 1961 , 2004 , 2048 , 2093 , 2139 , 2186 , 2233 , 2282 , 2332 , 2383 , 2435 dw 2489 , 2543 , 2599 , 2656 , 2714 , 2774 , 2834 , 2896 , 2960 , 3025 , 3091 dw 3158 , 3228 , 3298 , 3371 , 3444 , 3520 , 3597 , 3676 , 3756 , 3838 , 3922 dw 4008 , 4096 , 4186 , 4277 , 4371 , 4467 , 4565 , 4664 , 4767 , 4871 , 4978 dw 5087 , 5198 , 5312 , 5428 , 5547 , 5668 , 5793 , 5919 , 6049 , 6182 , 6317 dw 6455 , 6597 , 6741 , 6889 , 7039 , 7194 , 7351 , 7512 , 7677 , 7845 , 8016 dw 8192 , 8371 , 8555 , 8742 , 8933 , 9129 , 9329 , 9533 , 9742 , 9955 , 10173 dw 10396 , 10624 , 10856 , 11094 , 11337 , 11585 , 11839 , 12098 , 12363 , 12634 dw 12910 , 13193 , 13482 , 13777 , 14079 , 14387 , 14702 , 15024 , 15353 , 15689 dw 16033 , 16384 , 16743 , 17109 , 17484 , 17867 , 18258 , 18658 , 19066 , 19484 dw 19911 , 20347 , 20792 , 21247 , 21713 , 22188 , 22674 , 23170 , 23678 , 24196 dw 24726 , 25268 , 25821 , 26386 , 26964 , 27554 , 28158 , 28774 , 29404 , 30048 dw 30706 , 31379 , 32066 , 32768 , 33485 , 34219 , 34968 , 35734 , 36516 , 37316 dw 38133 , 38968 , 39821 , 40693 , 41584 , 42495 , 43425 , 44376 , 45348 , 46341 dw 47356 , 48393 , 49452 , 50535 , 51642 , 52772 , 53928 , 55109 , 56316 , 57549 dw 58809 , 60097 , 61413 , 62757
Fastest, accurate 8bit * 8bit Unsigned with 16 KB tables
Input: L = Multiplier, C = Multiplicand
Output: DE = Product
CPC Cycles: 80 = 20 usec
Size: 14 bytes of code + 16 KB tables
I'm currently working on a new routine, based on a routine by Kirk Meyer [1] which uses nibble multiplication tables.
I have a working, tested version which uses 16K of tables and can perform multiplication in 20μs.
The code to produce the tables is below:
.maketabs ld hl,hltab1 .makelp1 ld a,l rla and #1e add restab / 256 ld (hl),a inc l jr nz,makelp1 inc h .makelp2 ld a,l rra:rra:rra and #1e jr z,usez add restab2 - restab / 256 - 2 .usez add restab / 256 ld (hl),a inc l jr nz,makelp2 inc h ; restab .makelp3 ld a,(hl) inc h ld d,(hl) inc h add l ld (hl),a inc h ld a,0 adc d ld (hl),a dec h:dec h:dec h inc l jr nz,makelp3 inc h inc h ld a,h cp restab2 / 256 - 2 jr nz,makelp3 ld b,h ld c,l inc b inc b ld h,restab / 256 + 2 .makelp4 ld e,(hl) inc h ld d,(hl) dec h ex de,hl add hl,hl:add hl,hl:add hl,hl:add hl,hl ex de,hl ld a,e ld (bc),a inc b ld a,d ld (bc),a dec b inc l inc c jr nz,makelp4 inc h inc h inc b inc b ld a,h cp restab2 / 256 jr nz,makelp4 ret ds -$ and #ff .hltab1 ds 256 .hltab2 ds 256 .restab ds 512 * 16 .restab2 ds 512 * 15
The code to perform the multiplication (DE = L * C):
ld h,hltab1 / 256 ; 2 ld b,(hl) ; 4 inc h ; 5 ld h,(hl) ; 7 ld l,c ; 8 ld a,(bc) ; 10 add (hl) ; 12 ld e,a ; 13 inc b ; 14 inc h ; 15 ld a,(bc) ; 17 adc (hl) ; 19 ld d,a ; 20
Fast, accurate 8bit * 8bit Unsigned with 8 KB tables
Input: L = Multiplier, C = Multiplicand
Output: DE = Product
CPC Cycles: 100 = 25 usec
Size: 19 bytes of code + 8 KB tables
16K is a lot of memory to use for tables, but I'm working on a way to reduce this to 8K while maintaining similar performance (around 26μs). The idea is rather than using tables for the low and high nibbles, use tables for alternate bits. So there would be 16 tables for the values #00, #01, #04, #05, #10, #11, #14, #15, #40, #41, #44, #45, #50, #51, #54, #55. The values can be shifted by 1 (rather than 4) to give values for the alternate bits using a simple ADD HL,HL.
This routine should be easy to adapt to a 16bit * 8bit unsigned multiplication or even a 16bit * 16bit, using simple additions.
Executioner 03:58, 8 May 2007 (CEST)
Ok, here is the new 8K routine, first the routine to build the tables:
.maketabs ld hl,hltab1 .makelp5 ld a,l ld bc,0 rla:rl b:rla:rl c rla:rl b:rla:rl c rla:rl b:rla:rl c rla:rl b:rla:rl c ld a,b add a add restab / 256 ld (hl),a inc h ld a,c add a add restab / 256 ld (hl),a dec h inc l jr nz,makelp5 ld h,restab / 256 + 2 .makelp6 ld (hl),l inc l jr nz,makelp6 inc h:inc h ld b,2 .makelp8 push bc xor a rr b:rra:rrca rr b:rra:rrca rr b:rra:rrca rr b:rra:rrca .makelp7 push hl push af call mulLbyA ex de,hl pop af pop hl ld (hl),e inc h ld (hl),d dec h inc l jr nz,makelp7 inc h:inc h pop bc inc b bit 4,b jr z,makelp8 ret .mulLbyA ; MAX times ; MIN times ld e,l ; 1 ; 1 ld d,0 ; 3 ; 3 add a ; 4 ; 4 ld h,a ; 5 ; 5 jr c,ncad0 ; 8 ; 8 ld l,d .ncad0 add hl,hl ; 11 ; 11 jr nc,ncad1 ; 13 ; 14 add hl,de ; 16 .ncad1 add hl,hl ; 19 ; 17 jr nc,ncad2 ; 21 ; 20 add hl,de ; 24 .ncad2 add hl,hl ; 27 ; 23 jr nc,ncad3 ; 29 ; 26 add hl,de ; 32 .ncad3 add hl,hl ; 35 ; 29 jr nc,ncad4 ; 37 ; 32 add hl,de ; 40 .ncad4 add hl,hl ; 43 ; 35 jr nc,ncad5 ; 45 ; 38 add hl,de ; 48 .ncad5 add hl,hl ; 51 ; 41 jr nc,ncad6 ; 53 ; 44 add hl,de ; 56 .ncad6 add hl,hl ; 59 ; 47 ld a,h ; 60 ; 48 ret nc ; 61 ; 50? add hl,de ; 64 ld a,h ; 65 ret ds -$ and #ff .hltab1 ds 256 .hltab2 ds 256 .restab ds 512 * 16
And the routine to do the multiplication:
.umultCL ; Using 8 bit only (25us - DE = L * C) ld h,hltab2 / 256 ; 2 ld b,(hl) ; 4 dec h ; 5 ld h,(hl) ; 7 ld l,c ; 8 ld a,(hl) ; 10 add a ; 11 inc h ; 12 ld d,(hl) ; 14 rl d ; 16 ld h,b ; 17 add (hl) ; 19 ld e,a ; 20 inc h ; 21 ld a,(hl) ; 23 adc d ; 24 ld d,a ; 25 ret
Very fast 8bit * 8bit Unsigned with only 1K of tables
Input: A = Multiplier, L = Multiplicand
Output: DE = Product
CPC Cycles: 104-112 (108 on average) = 26-28 (27) usec
Size: 24 bytes of code and 1024 bytes for the tables
Here's a new routine I've developed which uses the formula ab = ((a + b)2 - (a - b)2) / 4. It's based on a routine for the 6502 by Stephen Judd in a C= Hacking article. Because of differences between the way the 6502 does register indexing it was quite difficult to actually get this working, but it's a great compromise between speed and space since it only uses 1K of tables (as opposed to the 16K or 8K table routines above), and can still manage to do the job in a maximum of 28 microseconds.
Firstly, once again, we need some code to generate the tables. These tables contain values for
x2/4 for 9 bit values of x, with the LSB when bit 8 is zero first followed by the MSB.
.gen_sq4 xor a ld de,umul_tab + #1ff ld (de),a dec d ld (de),a ld h,d ld l,e inc e ld c,e ld b,2 .sq4_lp ld a,b cp 2 ld a,e rra add (hl) ld (de),a inc h inc d ld a,(hl) adc c ld (de),a dec d ld h,d inc l inc e jr nz,sq4_lp inc d inc d djnz sq4_lp ret align #100 .umul_tab ds #400
Now for the actual multiply routine:
ld h,umul_tab_lo / #100 ; 2 ld b,h ; 3 add l ; 4 ld c,a ; 5 jr nc,@noovf ; 7 inc b ; 8 inc b ; 9 .@noovf sub l ; 10 sub l ; 11 jr nc,@noneg ; 13 neg ; 15 .@noneg ld l,a ; 16 ld a,(bc) ; 18 sub (hl) ; 20 ld e,a ; 21 inc b ; 22 inc h ; 23 ld a,(bc) ; 25 sbc (hl) ; 27 ld d,a ; 28
This code could easily be converted to a macro as it's only 24 bytes. I've tried to optimize it further but with no luck!
Executioner 06:25, 4 April 2008 (CEST)
Fast 8bit * 8bit Unsigned with only 512 bytes of tables
Input: A = Multiplier, B = Multiplicant
Output: A:E = Product
CPC Cycles: 136-172 (154 on average) = 34-43 (38.5) usec
Size: 44 bytes of code and 512 bytes for the tables
Fast 8 bit unsigned multiplication with 16 bit result. It uses formula
x*y = ((x+y)/2)2 - ((x-y)/2)2, if x+y is even
= ((x+y-1)/2)2 - ((x-y-1)/2)2 + y, if x+y is odd and x>=y
cp b jr nc,l1 ld e,a ld a,b ld b,e l1 ld c,a sub b rra ld d,a ld a,c add a,b rra ld l,a ld h,high(sqrlo) ld a,(hl) ld e,l ld l,d jr nc,l2 sub (hl) ;odd ld l,e ld e,a inc h ;loads high(sqrhi) ld a,(hl) ld l,d sbc a,(hl) ld d,a ld a,e add a,b ld e,a ld a,d adc a,0 ret l2 sub (hl) ;even ld l,e ld e,a inc h ld a,(hl) ld l,d sbc a,(hl) ret sqrlo ;low(x*x) should be at the page border db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1 db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1 db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1 db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81 db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61 db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41 db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21 db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1 db 0,1,4,9,$10,$19,$24,$31,$40,$51,$64,$79,$90,$a9,$c4,$e1 db 0,$21,$44,$69,$90,$b9,$e4,$11,$40,$71,$a4,$d9,$10,$49,$84,$c1 db 0,$41,$84,$c9,$10,$59,$a4,$f1,$40,$91,$e4,$39,$90,$e9,$44,$a1 db 0,$61,$c4,$29,$90,$f9,$64,$d1,$40,$b1,$24,$99,$10,$89,4,$81 db 0,$81,4,$89,$10,$99,$24,$b1,$40,$d1,$64,$f9,$90,$29,$c4,$61 db 0,$a1,$44,$e9,$90,$39,$e4,$91,$40,$f1,$a4,$59,$10,$c9,$84,$41 db 0,$c1,$84,$49,$10,$d9,$a4,$71,$40,$11,$e4,$b9,$90,$69,$44,$21 db 0,$e1,$c4,$a9,$90,$79,$64,$51,$40,$31,$24,$19,$10,9,4,$1 sqrhi ;high(x*x) db 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 db 1,1,1,1,1,1,1,2,2,2,2,2,3,3,3,3 db 4,4,4,4,5,5,5,5,6,6,6,7,7,7,8,8 db 9,9,9,$a,$a,$a,$b,$b,$c,$c,$d,$d,$e,$e,$f,$f db $10,$10,$11,$11,$12,$12,$13,$13,$14,$14,$15,$15,$16,$17,$17,$18 db $19,$19,$1a,$1a,$1b,$1c,$1c,$1d,$1e,$1e,$1f,$20,$21,$21,$22,$23 db $24,$24,$25,$26,$27,$27,$28,$29,$2a,$2b,$2b,$2c,$2d,$2e,$2f,$30 db $31,$31,$32,$33,$34,$35,$36,$37,$38,$39,$3a,$3b,$3c,$3d,$3e,$3f db $40,$41,$42,$43,$44,$45,$46,$47,$48,$49,$4a,$4b,$4c,$4d,$4e,$4f db $51,$52,$53,$54,$55,$56,$57,$59,$5a,$5b,$5c,$5d,$5f,$60,$61,$62 db $64,$65,$66,$67,$69,$6a,$6b,$6c,$6e,$6f,$70,$72,$73,$74,$76,$77 db $79,$7a,$7b,$7d,$7e,$7f,$81,$82,$84,$85,$87,$88,$8a,$8b,$8d,$8e db $90,$91,$93,$94,$96,$97,$99,$9a,$9c,$9d,$9f,$a0,$a2,$a4,$a5,$a7 db $a9,$aa,$ac,$ad,$af,$b1,$b2,$b4,$b6,$b7,$b9,$bb,$bd,$be,$c0,$c2 db $c4,$c5,$c7,$c9,$cb,$cc,$ce,$d0,$d2,$d4,$d5,$d7,$d9,$db,$dd,$df db $e1,$e2,$e4,$e6,$e8,$ea,$ec,$ee,$f0,$f2,$f4,$f6,$f8,$fa,$fc,$fe
Litwr 10:25, 24 October 2015 (CEST)
16bit * 16bit Unsigned -> 32 bits result
Input: BC = Multiplier, DE = Multiplicand
Output: DE,HL = Product
ld hl,0 ld a,16 muluw add hl,hl rl e rl d jr nc,muluw_cont add hl,bc jr nc,muluw_cont inc de muluw_cont dec a jr nz,muluw
16bit * 16bit Unsigned -> 24 bits result
Input: BC = Multiplier, DE = Multiplicand
Output: A,HL = Product
ld ix,0 ld hl,0 .mul24b1: ld a,c or b jr z,mul24b3 srl b rr c jr nc,mul24b2 add ix,de ld a,h adc l ld h,a .mul24b2: sla e rl d rl l jr mul24b1: .mul24b3: ld a,h db #dd:ld e,l db #dd:ld d,h ex de,hl ret
Web links
- Multiplications and divisions on the MSX Assembly Page
- "Multiplication on a Z80" ( MSX Computer & Club Webmagazine)
- The Z80 number cruncher (Andre Adrian) (16/32bit multiplication, addition, right shift)