Skip to content

Commit 471a8dd

Browse files
committed
optimized llmulhu with exx
1 parent aa7f1e6 commit 471a8dd

1 file changed

Lines changed: 106 additions & 93 deletions

File tree

src/crt/llmulhu.src

Lines changed: 106 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -7,105 +7,118 @@
77

88
; BC:UDE:UHL = ((uint128_t)BC:UDE:UHL * (uint128_t)(SP64)) >> 64
99
__llmulhu:
10+
; modified version of __llmulu that accumulates the upper 64 bits of the 128-bit product in the shadow registers (switched in with exx).
11+
; __llmulhu runs slightly faster than two calls to __llmulu, and is much faster
12+
; than the naive implementation of __llmulhu that calls __llmulu four times.
13+
push af
14+
ld a, i
15+
di
16+
push af
17+
1018
push ix
1119
push iy
12-
ld ix, -36
13-
add ix, sp
14-
ld sp, ix
15-
lea ix, ix + 36
16-
17-
ld (ix - 3), bc
18-
ld (ix - 6), de
19-
ld (ix - 9), hl
20-
21-
ld bc, 0
22-
ld (ix - 10), b
23-
ld (ix - 13), bc
24-
ld (ix - 30), bc
25-
ld c, (ix + 12)
26-
ld (ix - 33), bc
27-
ld iy, (ix + 9)
28-
ld (ix - 36), iy
29-
30-
; x_lo * y_lo
31-
ld c, b
32-
ld d, b
33-
inc de
34-
dec.s de
35-
call __llmulu
36-
inc bc
37-
dec.s bc
38-
ld (ix - 16), bc
39-
ld (ix - 19), de
40-
ld b, 0
41-
ld c, b
42-
43-
; x_hi * y_lo
44-
inc.s de
45-
ld d, b
46-
ld e, (ix - 2)
47-
ld hl, (ix - 5)
48-
call __llmulu
49-
inc bc
50-
dec.s bc
51-
ld (ix - 21), bc
52-
ld (ix - 24), de
53-
ld (ix - 27), hl
54-
55-
ld c, (ix + 16)
56-
ld (ix - 33), c
57-
ld iy, (ix + 13)
58-
ld (ix - 36), iy
59-
60-
; x_lo * y_hi
61-
ld b, 0
62-
ld c, b
63-
inc.s de
64-
ld d, b
65-
ld e, (ix - 6)
66-
ld hl, (ix - 9)
67-
call __llmulu
68-
inc bc
69-
dec.s bc
70-
lea iy, ix - 27
71-
call .L.__llmulhu_i72add
72-
lea iy, ix - 18
73-
call .L.__llmulhu_i72add
74-
ld (ix - 16), bc
75-
ld (ix - 19), de
76-
ld bc, 0
77-
78-
; x_hi * y_hi
79-
inc.s de
80-
ld d, b
81-
ld e, (ix - 2)
82-
ld hl, (ix - 5)
83-
call __llmulu
84-
inc bc
85-
dec.s bc
86-
lea iy, ix - 18
87-
call .L.__llmulhu_i72add
88-
ld sp, ix
89-
pop iy
90-
pop ix
91-
ret
9220

93-
.L.__llmulhu_i72add:
94-
; similar to __lladd, except iy points to the stack and is destroyed
95-
push bc
96-
ld bc, (iy + 0)
97-
add hl, bc
21+
ld ix, 0
22+
lea iy, ix - 6
23+
add iy, sp ; cf=1
24+
25+
push de
26+
push hl
27+
ld l, c
28+
ld h, b
29+
ld.s sp, hl
30+
31+
lea hl, iy + 21
32+
ld b, 8
33+
.L.push_loop:
34+
push af
35+
ld a, (hl)
36+
inc hl
37+
or a, a ; cf=0
38+
djnz .L.push_loop
39+
40+
sbc hl, hl
41+
ld e, l
42+
ld d, h
43+
44+
exx
45+
sbc hl, hl
46+
ex de, hl
47+
sbc hl, hl
48+
ld c, l
49+
ld b, l
50+
exx
51+
52+
.L.byte_loop:
53+
scf
54+
adc a, a
55+
56+
.L.bit_loop:
57+
ex af, af'
58+
59+
add ix, ix
60+
adc hl, hl
9861
ex de, hl
62+
adc.s hl, hl
63+
ex de, hl
64+
65+
exx
66+
adc hl, hl
67+
ex de, hl
68+
adc hl, hl
69+
ex de, hl
70+
rl c
71+
rl b
72+
exx
73+
74+
ex af, af'
75+
76+
jr nc, .L.add_end
77+
ld bc, (iy)
78+
add ix, bc
9979
ld bc, (iy + 3)
10080
adc hl, bc
10181
ex de, hl
102-
pop bc
103-
jr nc, .L.no_carry48
82+
adc.s hl, sp
83+
ex de, hl
84+
jr nc, .L.add_end
85+
exx
86+
inc hl
87+
add hl, de
88+
or a, a
89+
sbc hl, de
90+
jr nz, .L.add_end_exx
91+
inc de
92+
sbc hl, de
93+
add hl, de
94+
jr nz, .L.add_end_exx
10495
inc bc
105-
.L.no_carry48:
106-
ld iy, (iy + 6)
107-
add iy, bc
108-
lea bc, iy + 0
109-
ret
96+
.L.add_end_exx:
97+
exx
98+
.L.add_end:
99+
100+
add a, a
101+
jr nz, .L.bit_loop
102+
103+
pop af
104+
jr nc, .L.byte_loop
105+
106+
; ld b, d
107+
; ld c, e
108+
; ex de, hl
109+
; lea hl, ix + 0
110+
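; with the commented-out moves above enabled, BC:UDE:UHL = lower 64 bits (otherwise they remain in DE:UHL:UIX and are discarded)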
111+
; shadow BC:UDE:UHL = upper 64 bits; the exx below swaps them into the main set as the return value
112+
exx
110113

111-
.extern __llmulu
114+
pop af ; discard the pushed copy of HL (low 24 bits of the multiplicand) to restore SP
115+
pop af ; discard the pushed copy of DE (next 24 bits of the multiplicand) to restore SP
116+
pop iy
117+
pop ix
118+
119+
pop af
120+
jp po, .L.skipEI
121+
ei
122+
.L.skipEI:
123+
pop af
124+
ret

0 commit comments

Comments
 (0)