/*
 * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "hardware/regs/addressmap.h"
#include "hardware/divider_helper.S"

.syntax unified
.cpu cortex-m0plus
.thumb

// PICO_CONFIG: PICO_DIVIDER_DISABLE_INTERRUPTS, Disable interrupts around division such that divider state need not be saved/restored in exception handlers, default=0, group=pico_divider

#include "pico/asm_helper.S"

// PICO_CONFIG: PICO_DIVIDER_CALL_IDIV0, Whether 32 bit division by zero should call __aeabi_idiv0, default=1, group=pico_divider
#ifndef PICO_DIVIDER_CALL_IDIV0
#define PICO_DIVIDER_CALL_IDIV0 1
#endif

// PICO_CONFIG: PICO_DIVIDER_CALL_LDIV0, Whether 64 bit division by zero should call __aeabi_ldiv0, default=1, group=pico_divider
#ifndef PICO_DIVIDER_CALL_LDIV0
#define PICO_DIVIDER_CALL_LDIV0 1
#endif

// PICO_CONFIG: PICO_DIVIDER_IN_RAM, Whether divider functions should be placed in RAM, default=0, group=pico_divider

// div_section name: switch to the executable ("ax") section used for `name`.
// The section name is built with RAM_SECTION_NAME when PICO_DIVIDER_IN_RAM is non-zero and with
// SECTION_NAME otherwise (both helpers are defined outside this file - presumably pico/asm_helper.S).
.macro div_section name
#if PICO_DIVIDER_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm

@ wait 8-n cycles for the hardware divider
@
@ wait_div n emits filler instructions only: (8-n)/2 copies of b 9f, each a
@ branch to the very next instruction and counted here as two cycles, plus a
@ single nop when 8-n is odd. Presumably n is the number of cycles the caller
@ already spends between writing the divisor and reading the result.
@ The macro defines and consumes the numeric local label 9.
.macro wait_div n
.rept (8-\n) / 2
    b 9f
9:
.endr
.if (8-\n) % 2
    nop
.endif
.endm

// Build-time guard: the divider save/restore code assumes the SDIVIDEND, SDIVISOR, QUOTIENT and
// REMAINDER registers sit at consecutive word offsets, in that order.
#if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4)
#error register layout has changed - we rely on this order to make sure we save/restore in the right order
#endif

#if !PICO_DIVIDER_DISABLE_INTERRUPTS
// save_div_state_and_lr_64: prologue for the 64 bit entry points when the divider is DIRTY.
// Pushes r4-r7 and lr, spins until the divider reports READY, then captures the divider state:
//   r4 = dividend, r5 = divisor, r7 = remainder, r6 = quotient
// Only r4-r7 are used as scratch, so the 64 bit arguments in r0-r3 are left untouched.
// Defines and consumes the numeric local label 1.
.macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
1:
    ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
    // wait for results as we can't save signed-ness of operation
    lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    // r6 is the base pointer, so it is overwritten only by this final load
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm

// restore_div_state_and_return_64: epilogue matching save_div_state_and_lr_64.
// Writes the state held in r4-r7 back to the divider, then pops r4-r7 and returns through the
// saved lr. r0-r3 (the freshly computed 64 bit quotient and remainder) are preserved; r2 is
// parked in ip while it is borrowed as the SIO base pointer.
.macro restore_div_state_and_return_64
    // writing dividend (r4), divisor (r5), remainder (r7), quotient (r6) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //    ... interruptor using div will complete based on incorrect inputs, but dividend at least will be
    //        saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //    ... interruptor using div will complete based on possibly wrongly signed inputs, but dividend, divisor
    //        at least will be saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5, r7: we are DIRTY and READY
    //    ... interruptor using div will save/restore the dividend, divisor, remainder registers as is (what we
    //        just restored ourselves), and we'll restore the quotient after the fact

    mov ip, r2
    ldr r2, =SIO_BASE
    // note we do not use STM; not because it can be restarted due to an interrupt (which is harmless), but because
    // this is 1 cycle IO space and so 4 individual writes are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    mov r2, ip
    pop {r4, r5, r6, r7, pc}
.endm

#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

// since idiv and idivmod only differ by a cycle, we'll make them the same!
//
// Signed 32 bit divide / divmod using the hardware divider.
//   in:  r0 = dividend, r1 = divisor
//   out: r0 = quotient, r1 = remainder
//   scratch: r2 (SIO base), r3, flags
// Division by zero: r0 is set to 0 for 0/0, 0x7fffffff for a positive dividend and 0x80000000 for
// a negative dividend, __aeabi_idiv0 is called (when PICO_DIVIDER_CALL_IDIV0 is set) and the
// remainder is returned as 0.
//
// NOTE(review): when PICO_DIVIDER_DISABLE_INTERRUPTS is 0, the code following the
// divmod_s32s32_unsafe label uses r2 as the SIO base without loading it, so that entry point
// relies on r2 already holding SIO_BASE - confirm this is intended for direct callers.
div_section WRAPPER_FUNC_NAME(__aeabi_idiv)
.align 2
wrapper_func __aeabi_idiv
wrapper_func __aeabi_idivmod
regular_func div_s32s32
regular_func divmod_s32s32
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_s32s32_savestate
regular_func divmod_s32s32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
// are the hardware_divider functions that can be used instead anyway
regular_func divmod_s32s32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
    // divisor is zero
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    movs r1, #0x80
    lsls r1, #24            // r1 = 0x80000000
    asrs r2, r0, #31        // r2 = 0 or 0xffffffff according to the sign of the dividend
    eors r1, r2             // r1 = 0x80000000 (dividend >= 0) or 0x7fffffff (dividend < 0)
    cmp r0, #0
    beq 1f                  // 0/0: pass 0
    mvns r0, r1             // r0 = 0x7fffffff (dividend > 0) or 0x80000000 (dividend < 0)
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
regular_func divmod_s32s32_savestate
    // note that we must be at least 2 cycles into division at this point,
    // which we are because of the dirty check before getting here (and of course the function call before that)
    // (save_div_state_and_lr / restore_div_state_and_return are not defined in this file;
    // presumably they come from hardware/divider_helper.S)
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

// since uidiv and uidivmod only differ by a cycle, we'll make them the same!
//
// Unsigned 32 bit divide / divmod using the hardware divider.
//   in:  r0 = dividend, r1 = divisor
//   out: r0 = quotient, r1 = remainder
//   scratch: r2 (SIO base), r3, flags
// Division by zero: r0 is set to 0 for 0/0 and 0xffffffff otherwise, __aeabi_idiv0 is called
// (when PICO_DIVIDER_CALL_IDIV0 is set) and the remainder is returned as 0.
//
// NOTE(review): when PICO_DIVIDER_DISABLE_INTERRUPTS is 0, the code following the
// divmod_u32u32_unsafe label uses r2 as the SIO base without loading it, so that entry point
// relies on r2 already holding SIO_BASE - confirm this is intended for direct callers.
div_section WRAPPER_FUNC_NAME(__aeabi_uidiv)
regular_func div_u32u32
regular_func divmod_u32u32
wrapper_func __aeabi_uidiv
wrapper_func __aeabi_uidivmod
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_u32u32_savestate
regular_func divmod_u32u32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
// are the hardware_divider functions that can be used instead anyway
regular_func divmod_u32u32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
    // divisor is zero
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    cmp r0, #0
    beq 1f                  // 0/0: pass 0
    movs r0, #0
    mvns r0, r0             // otherwise pass 0xffffffff
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
regular_func divmod_u32u32_savestate
    // (save_div_state_and_lr / restore_div_state_and_return are not defined in this file;
    // presumably they come from hardware/divider_helper.S)
    save_div_state_and_lr
    bl divmod_u32u32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

div_section WRAPPER_FUNC_NAME(__aeabi_ldiv)

// Signed 64 bit divide / divmod entry points.
//   in:  r0:r1 = dividend (low:high), r2:r3 = divisor (low:high)
//   out: r0:r1 = quotient, r2:r3 = remainder (as produced by divmod_s64s64_unsafe)
// This wrapper only protects the hardware divider state; the arithmetic is in divmod_s64s64_unsafe.
.align 2
wrapper_func __aeabi_ldivmod
regular_func div_s64s64
regular_func divmod_s64s64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2              // all of r0-r3 carry arguments, so park r2 in ip while it is used as scratch
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip              // high-register mov: the carry produced by lsrs is still valid for bcs
    bcs divmod_s64s64_savestate
    b divmod_s64s64_unsafe
.align 2
divmod_s64s64_savestate:
    save_div_state_and_lr_64
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_s64s64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

// Unsigned 64 bit divide / divmod entry points.
//   in:  r0:r1 = dividend (low:high), r2:r3 = divisor (low:high)
//   out: r0:r1 = quotient, r2:r3 = remainder (as produced by divmod_u64u64_unsafe)
// This wrapper only protects the hardware divider state; the arithmetic is in divmod_u64u64_unsafe.
.align 2
wrapper_func __aeabi_uldivmod
regular_func div_u64u64
regular_func divmod_u64u64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2              // all of r0-r3 carry arguments, so park r2 in ip while it is used as scratch
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip              // high-register mov: the carry produced by lsrs is still valid for bcs
    bcs divmod_u64u64_savestate
    b divmod_u64u64_unsafe
.align 2
regular_func divmod_u64u64_savestate
    save_div_state_and_lr_64
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_u64u64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */

@ dneg lo,hi: arithmetically negate, in place, the 64 bit value held in \hi:\lo.
@ The high word is inverted and the low word negated; when the low word is zero
@ the +1 of the negation has to carry into the high word, hence the adds.
@ Clobbers the flags. Uses a label made unique per expansion via \@.
.macro dneg lo,hi
    mvns \hi,\hi
    negs \lo,\lo
    bne l\@_1
    adds \hi,#1
l\@_1:
.endm

@ divmod_s64s64_unsafe: signed 64 bit divmod without any divider state protection.
@   in:  r0:r1 = y (dividend), r2:r3 = x (divisor)
@   out: r0:r1 = quotient, r2:r3 = remainder
@ Reduces to divmod_u64u64_unsafe by negating negative operands, then fixes up
@ the signs: the quotient is negated when the operand signs differ and the
@ remainder is negated when y was negative.
@ Division by zero passes 0 (y==0), 2^63-1 (y>0) or -2^63 (y<0) in r0:r1 to
@ __aeabi_ldiv0 (when PICO_DIVIDER_CALL_LDIV0 is set) and returns remainder 0.
.align 2
regular_func divmod_s64s64_unsafe
    cmp r3,#0
    blt 1f
@ here x +ve
    beq 2f                    @ could x be zero?
3:
    cmp r1,#0
    bge divmod_u64u64_unsafe  @ both positive
@ y -ve, x +ve
    push {r14}
    dneg r0,r1
    bl divmod_u64u64_unsafe
    dneg r0,r1                @ quotient is negative
    dneg r2,r3                @ remainder takes the sign of y
    pop {r15}

2:
    cmp r2,#0
    bne 3b                    @ back if x not zero

    cmp r0,#0                 @ y==0?
    bne 4f
    cmp r1,#0
    beq 5f                    @ then pass 0 to __aeabi_ldiv0
4:
    movs r0,#0
    lsrs r1,#31
    lsls r1,#31               @ get sign bit
    bne 5f                    @ y -ve? pass -2^63 to __aeabi_ldiv0
    mvns r0,r0
    lsrs r1,r0,#1             @ y +ve: pass 2^63-1 to __aeabi_ldiv0
5:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0                @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
@ here x -ve
    push {r14}
    cmp r1,#0
    blt 1f
@ y +ve, x -ve
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r0,r1                @ quotient is negative, remainder stays positive
    pop {r15}

1:
@ y -ve, x -ve
    dneg r0,r1
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r2,r3                @ quotient is positive, remainder takes the sign of y
    pop {r15}

@ divmod_u64u64_unsafe: unsigned 64 bit divmod without any divider state protection.
@   in:  r0:r1 = y (dividend), r2:r3 = x (divisor)
@   out: r0:r1 = quotient, r2:r3 = remainder
@ This first part handles y<2^32 directly and dispatches to y64 otherwise.
@ Division by zero passes 0 (y==0) or 2^64-1 in r0:r1 to __aeabi_ldiv0 (when
@ PICO_DIVIDER_CALL_LDIV0 is set) and returns remainder 0.
regular_func divmod_u64u64_unsafe
    cmp r1,#0
    bne y64                   @ y fits in 32 bits?
    cmp r3,#0                 @ yes; and x?
    bne 1f
    cmp r2,#0
    beq 2f                    @ x==0?
    mov r12,r7                @ keep r7 in r12 rather than on the stack while it holds the SIO base
    ldr r7,=SIO_BASE
    str r0,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    movs r1,#0
    movs r3,#0
    wait_div 2
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]
    mov r7,r12
    bx r14

2:                            @ divide by 0 with y<2^32
    cmp r0,#0                 @ y==0?
    beq 3f                    @ then pass 0 to __aeabi_ldiv0
udiv0:
    ldr r0,=0xffffffff
    movs r1,r0                @ pass 2^64-1 to __aeabi_ldiv0
3:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0                @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
    movs r2,r0                @ x>y, so result is 0 remainder y
    movs r3,r1
    movs r0,#0
    movs r1,#0
    bx r14

.ltorg

@ here y occupies more than 32 bits
@ split into cases according to the size of x
@
@ register state on entry: r0:r1 = y, r2:r3 = x, r14 = return address
@ The x<=16 bit case handled here is a schoolbook long division in three steps
@ of the hardware divider. In the comments of that case y0 is the high word of
@ y, and r0, r1, r2 name the three successive remainders, not CPU registers.
y64:
    cmp r3,#0
    beq 1f
    b y64_x48                 @ if x does not fit in 32 bits, go to 48- and 64-bit cases
1:
    lsrs r3,r2,#16
    bne y64_x32               @ jump if x is 17..32 bits

@ here x is at most 16 bits

    cmp r2,#0
    beq udiv0                 @ x==0? exit as with y!=0 case above
    push {r7}
    ldr r7,=SIO_BASE
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    wait_div 4
    push {r4, r5}
    lsrs r4,r0,#16
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r0=y0-q0*x; 0<=r0<x
    ldr r1,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q0=y0/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y1=(r0<<16)+(((ui32)y)>>16);
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]  @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
    wait_div 1
    uxth r4,r0
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r1=y1-q1*x; 0<=r1<x
    ldr r5,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q1=y1/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y2=(r1<<16)+(((ui32)y)&0xffff);
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]  @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
    wait_div 3
    movs r3,#0
    lsls r4,r5,#16            @ quotient=(q0<<32)+(q1<<16)+q2
    lsrs r5,#16
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r2=y2-q2*x; 0<=r2<x
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]  @ q2=y2/x;
    adds r0,r4
    adcs r1,r5
    pop {r4,r5,r7}            @ undoes both pushes above in one go
    bx r14

.ltorg

@ y64_x32: y needs more than 32 bits, x is 17..32 bits.
@ register state on entry: r0:r1 = y, r2 = x, r14 = return address
@ x is normalised into x0 (top bit set), the hardware divider supplies a 16 bit
@ reciprocal estimate r of x0, and the quotient is then accumulated from
@ successive estimates q, each followed by subtracting q*x0 from y.
@ The xsh<9 tail is continued at last0. Returns with pop {r4-r7,r15}.
y64_x32:
@ here x is 17..32 bits
    push {r4-r7,r14}
    mov r12,r2                @ save x
    movs r5,#0                @ xsh=0
    lsrs r4,r2,#24
    bne 1f
    lsls r2,#8                @ if(x0<1U<<24) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r2,#28
    bne 1f
    lsls r2,#4                @ if(x0<1U<<28) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r2,#30
    bne 1f
    lsls r2,#2                @ if(x0<1U<<30) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r2,#31
    bne 1f
    lsls r2,#1                @ if(x0<1U<<31) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^31<=x0<2^32, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+33 33<=qb<49
    lsrs r4,r2,#15
    adds r4,#1                @ x1=(x0>>15)+1; 2^16<x1<=2^17

    ldr r7,=SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    uxth r3,r2                @ x0l
    wait_div 2
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh
@ r12   x

    muls r6,r4
    lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7                @ quh=q0<<13

    muls r3,r6                @ x0l*q
    lsrs r7,r3,#15
    lsls r3,#17               @ r3:r7 is (x0l*q)<<17
    subs r0,r3
    sbcs r1,r7                @ y-=(x0l*q)<<17

    lsrs r3,r2,#16            @ x0h
    muls r3,r6                @ q*x0h
    adds r3,r3
    subs r1,r3                @ y-=(x0h*q)<<17

    lsrs r6,r1,#3
    muls r6,r4
    lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;
    add r14,r6                @ quh+=q1

    uxth r3,r2                @ x0l
    muls r3,r6                @ x0l*q
    lsrs r7,r3,#28
    lsls r3,#4                @ r3:r7 is (x0l*q)<<4
    subs r0,r3
    sbcs r1,r7                @ y-=(x0l*q)<<4

    lsrs r3,r2,#16            @ x0h
    muls r3,r6                @ x0h*q
    lsrs r7,r3,#12
    lsls r3,#20               @ r3:r7 is (x0h*q)<<4
    subs r0,r3
    sbcs r1,r7                @ y-=(x0h*q)<<4

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7                @ y>>22
    muls r6,r4
    lsrs r6,#16               @ q=((ui32)(y>>22)*r)>>16;

    cmp r5,#9
    blt last0                 @ if(xsh<9) goto last0;

@ on this path xsh>=9, which means x<2^23
    lsrs r2,#9                @ x0>>9: this shift loses no bits
@ the remainder y-x0*q is guaranteed less than a very small multiple of the remaining quotient
@ bits (at most 6 bits) times x, and so fits in one word
    muls r2,r6                @ x0*q
    subs r0,r2                @ y-x0*q
    lsls r7,r6,#13            @ qul=q<<13
1:
    lsrs r6,r0,#9
    muls r6,r4
    lsrs r6,#16               @ q=((ui32)(y>>9)*r)>>16;

@ here
@ r0    y
@ r2    x0>>9
@ r5    xsh
@ r6    q
@ r7    qul
@ r12   x
@ r14   quh

    movs r3,#22
    subs r3,r5                @ 22-xsh
    lsrs r6,r3                @ q>>=22-xsh
    lsrs r7,r3                @ qul>>=22-xsh
    adds r7,r6                @ qul+=q
    mov r4,r12
    muls r6,r4                @ x*q
    subs r2,r0,r6             @ y-=x*q
    mov r0,r14                @ quh
    adds r5,#4                @ xsh+4
    adds r3,#6                @ 28-xsh
    movs r1,r0
    lsrs r1,r3
    lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
    adds r0,r7
    bcc 1f
2:
    adds r1,#1
1:                            @ qu=((ui64)quh<<(4+xsh))+qul
    cmp r2,r4
    bhs 3f
    movs r3,#0
    pop {r4-r7,r15}

.ltorg

3:
    subs r2,r4
    adds r0,#1
    bcc 1b
    b 2b                      @ while(y>=x) y-=x,qu++;

@ last0: final step of y64_x32 when xsh<9. Reached by branch from y64_x32
@ with r4-r7 and the return address already pushed, so it returns with
@ pop {r4-r7,r15}. Produces the quotient in r0:r1 and the remainder in r2:r3.
@ here:
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh; xsh<9
@ r6    q
@ r12   x
@ r14   quh

last0:
    movs r7,#9
    subs r7,r5                @ 9-xsh
    lsrs r6,r7
    mov r4,r12                @ x
    uxth r2,r4
    muls r2,r6                @ q*xlo
    subs r0,r2
    bcs 1f
    subs r1,#1                @ y-=q*xlo
1:
    lsrs r2,r4,#16            @ xhi
    muls r2,r6                @ q*xhi
    lsrs r3,r2,#16
    lsls r2,#16
    subs r2,r0,r2
    sbcs r1,r3                @ y-q*xhi
    movs r3,r1                @ y now in r2:r3
    mov r0,r14                @ quh
    adds r5,#4                @ xsh+4
    adds r7,#19               @ 28-xsh
    movs r1,r0
    lsrs r1,r7
    lsls r0,r5                @ r0:r1 is quh<<(4+xsh)
    adds r0,r6
    bcc 1f
    adds r1,#1                @ qu=(quh<<(xsh+4))+q
1:
    cmp r3,#0                 @ y>=2^32?
    bne 3f
    cmp r2,r4                 @ y>=x?
    bhs 4f
    pop {r4-r7,r15}

@ correction loop while y still has a non-zero high word
3:
    adds r0,#1                @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4                @ y-=x
    bcs 3b
    subs r3,#1                @ borrow from the high word of y
    bne 3b

@ correction loop once y fits in 32 bits
1:
    cmp r2,r4
    bhs 4f
    pop {r4-r7,r15}

4:
    adds r0,#1                @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4                @ y-=x
    b 1b

@ y64_x48: y needs more than 32 bits, x is 33..64 bits.
@ register state on entry: r0:r1 = y, r2:r3 = x, r14 = return address
@ The 49..64 bit case is handed to y64_x64 (after the register push below).
@ For 33..48 bits x is normalised into x0, the hardware divider supplies a
@ 16 bit reciprocal estimate r, and the quotient (which fits in 32 bits) is
@ built from successive estimates; last1 is the short tail used when xsh<12.
@ Returns quotient in r0:r1 and remainder in r2:r3 with pop {r4-r7,r15}.
y64_x48:
@ here x is 33..64 bits
    push {r4-r7,r14}          @ save callee-saved registers and the return address
    lsrs r4,r3,#16
    beq 1f
    b y64_x64                 @ jump if x is 49..64 bits
1:
    push {r2-r3}              @ save a copy of x
@ here x is 33..48 bits
    movs r5,#0                @ xsh=0
    lsrs r4,r3,#8
    bne 1f
    lsls r3,#8
    lsrs r6,r2,#24
    orrs r3,r6
    lsls r2,#8                @ if(x0<1U<<40) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r3,#12
    bne 1f
    lsls r3,#4
    lsrs r6,r2,#28
    orrs r3,r6
    lsls r2,#4                @ if(x0<1U<<44) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r3,#14
    bne 1f
    lsls r3,#2
    lsrs r6,r2,#30
    orrs r3,r6
    lsls r2,#2                @ if(x0<1U<<46) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r3,#15
    bne 1f
    adds r2,r2
    adcs r3,r3                @ if(x0<1U<<47) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^47<=x0<2^48, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+17 17<=qb<33
    movs r4,r3
    adds r7,r2,r2             @ only the carry out of this add is used; r7 is reloaded below
    adcs r4,r4
    adds r4,#1                @ x1=(ui32)(x0>>31)+1; // 2^16<x1<=2^17

    ldr r7,=SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    wait_div 1
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2:r3 x0
@ r4    r
@ r5    xsh 0<=xsh<16

    muls r6,r4
    lsrs r6,#16               @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7                @ save q<<13
    uxth r7,r2                @ x0l
    muls r7,r6
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3                @ x0h
    muls r7,r6
    subs r1,r7
    subs r1,r7
    lsrs r7,r2,#16            @ x0m
    muls r7,r6
    lsls r6,r7,#17
    lsrs r7,#15
    subs r0,r6
    sbcs r1,r7                @ y-=((ui64)q*x0)<<1;

    lsrs r6,r1,#3             @ y>>35
    muls r6,r4
    lsrs r6,#16               @ q=((ui32)(y>>35)*r)>>16;

    cmp r5,#12
    blt last1                 @ if(xsh<12) goto last1;

    add r14,r6                @ qu<<13+q
    lsrs r2,#12
    lsls r7,r3,#20
    orrs r2,r7
    lsrs r3,#12               @ x0>>12

    uxth r7,r2                @ x0l
    muls r7,r6
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3                @ x0h
    muls r7,r6
    subs r1,r7
    lsrs r7,r2,#16            @ x0m
    muls r7,r6
    lsls r6,r7,#16
    lsrs r7,#16
    subs r0,r6
    sbcs r1,r7                @ y-=((ui64)q*x0)>>12

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7                @ y>>22
    muls r6,r4
    movs r7,#41
    subs r7,r5
    lsrs r6,r7                @ q=((ui32)(y>>22)*r)>>(16+25-xsh)

    subs r5,#12
    mov r7,r14
    lsls r7,r5
2:
    adds r7,r6                @ qu=(qu<<(xsh-12))+q
    pop {r4,r5}               @ recall x

@ here
@ r0:r1 y
@ r4:r5 x
@ r6    q
@ r7    qu

    uxth r2,r4
    uxth r3,r5
    muls r2,r6                @ xlo*q
    muls r3,r6                @ xhi*q
    subs r0,r2
    sbcs r1,r3
    lsrs r2,r4,#16
    muls r2,r6
    lsrs r3,r2,#16
    lsls r2,#16               @ xm*q
    subs r0,r2
    sbcs r1,r3                @ y-=(ui64)q*x

1:
    movs r2,r0                @ keep the last non-negative y as the remainder
    movs r3,r1
    adds r7,#1
    subs r0,r4
    sbcs r1,r5                @ while(y>=x) y-=x,qu++;
    bhs 1b
    subs r0,r7,#1             @ correction to qu
    movs r1,#0
    pop {r4-r7,r15}

last1:
@ r0:r1 y
@ r2:r3 x0
@ r5    xsh
@ r6    q

    movs r7,#12
    subs r7,r5
    lsrs r6,r7                @ q>>=12-xsh
    mov r7,r14
    lsrs r7,#13
    lsls r7,r5
    adds r7,r7                @ qu<<(xsh+1)
    b 2b                      @ rejoin the common tail in y64_x48

@ y64_x64: y needs more than 32 bits, x is 49..64 bits.
@ Reached by branch from y64_x48 with r4-r7 and the return address already
@ pushed, so it returns with pop {r4-r7,r15}.
@ register state on entry: r0:r1 = y, r2:r3 = x
@ A quotient estimate q is taken from the high words with the hardware divider
@ (the divisor is rounded up so q is never too large), q*x is subtracted from
@ y, and a correction loop finishes the job.
@ Returns quotient in r0 (r1=0) and remainder in r2:r3.
y64_x64:
@ here x is 49..64 bits
    movs r4,#0                @ q=0 if x>>32==0xffffffff
    adds r5,r3,#1
    beq 1f

    ldr r7,=SIO_BASE
    str r5,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    wait_div 0
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q=(ui32)(y>>32)/((x>>32)+1)
1:
    uxth r5,r2
    uxth r6,r3
    muls r5,r4
    muls r6,r4
    subs r0,r5
    sbcs r1,r6
    lsrs r5,r2,#16
    lsrs r6,r3,#16
    muls r5,r4
    muls r6,r4
    lsls r6,#16
    lsrs r7,r5,#16
    orrs r6,r7
    lsls r5,#16
    subs r0,r5
    sbcs r1,r6                @ y-=(ui64)q*x

    cmp r1,r3                 @ while(y>=x) y-=x,q++
    bhs 1f
3:
    movs r2,r0
    movs r3,r1
    movs r0,r4
    movs r1,#0
    pop {r4-r7,r15}

1:
    bne 2f                    @ flags are from cmp r1,r3: high words differ, so y>x
    cmp r0,r2                 @ high words equal: decide on the low words
    blo 3b
2:
    subs r0,r2
    sbcs r1,r3
    adds r4,#1
    cmp r1,r3
    blo 3b
    b 1b

867
868div_section divmod_s64s64_rem
869regular_func divmod_s64s64_rem
870 push {r4, lr}
871 bl divmod_s64s64
872 ldr r4, [sp, #8]
873 stmia r4!, {r2,r3}
874 pop {r4, pc}
875
// divmod_u64u64_rem: unsigned 64 bit divide that also stores the remainder through a pointer.
//   in:  r0:r1 = dividend, r2:r3 = divisor, first stack argument = address to receive the remainder
//   out: r0:r1 = quotient; the 64 bit remainder is written to that address
div_section divmod_u64u64_rem
regular_func divmod_u64u64_rem
    push {r4, lr}
    bl divmod_u64u64
    ldr r4, [sp, #8]        // stack argument: two words were pushed above, so it now sits at sp+8
    stmia r4!, {r2,r3}      // store remainder low word, then high word
    pop {r4, pc}