Austin Schuh | 208337d | 2022-01-01 14:29:11 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2020 Raspberry Pi (Trading) Ltd. |
| 3 | * |
| 4 | * SPDX-License-Identifier: BSD-3-Clause |
| 5 | */ |
| 6 | |
| 7 | #include "hardware/regs/addressmap.h" |
| 8 | #include "hardware/divider_helper.S" |
| 9 | |
| 10 | .syntax unified |
| 11 | .cpu cortex-m0plus |
| 12 | .thumb |
| 13 | |
| 14 | // PICO_CONFIG: PICO_DIVIDER_DISABLE_INTERRUPTS, Disable interrupts around division such that divider state need not be saved/restored in exception handlers, default=0, group=pico_divider |
| 15 | |
| 16 | #include "pico/asm_helper.S" |
| 17 | |
| 18 | // PICO_CONFIG: PICO_DIVIDER_CALL_IDIV0, Whether 32 bit division by zero should call __aeabi_idiv0, default=1, group=pico_divider |
| 19 | #ifndef PICO_DIVIDER_CALL_IDIV0 |
| 20 | #define PICO_DIVIDER_CALL_IDIV0 1 |
| 21 | #endif |
| 22 | |
// PICO_CONFIG: PICO_DIVIDER_CALL_LDIV0, Whether 64 bit division by zero should call __aeabi_ldiv0, default=1, group=pico_divider
| 24 | #ifndef PICO_DIVIDER_CALL_LDIV0 |
| 25 | #define PICO_DIVIDER_CALL_LDIV0 1 |
| 26 | #endif |
| 27 | |
| 28 | // PICO_CONFIG: PICO_DIVIDER_IN_RAM, Whether divider functions should be placed in RAM, default=0, group=pico_divider |
@ Emit the named divider function into RAM or flash depending on
@ PICO_DIVIDER_IN_RAM (both sections are allocatable + executable, "ax").
.macro div_section name
#if PICO_DIVIDER_IN_RAM
.section RAM_SECTION_NAME(\name), "ax"
#else
.section SECTION_NAME(\name), "ax"
#endif
.endm
| 36 | |
@ wait 8-n cycles for the hardware divider
@ The SIO divider takes 8 cycles from operand write until results are ready;
@ \n is the number of cycles the caller spends anyway before reading them.
@ Each "b 9f" to the very next instruction burns 2 cycles; a trailing nop
@ covers an odd remainder.
.macro wait_div n
.rept (8-\n) / 2
    b 9f
9:
.endr
.if (8-\n) % 2
    nop
.endif
.endm
| 47 | |
| 48 | #if (SIO_DIV_SDIVISOR_OFFSET != SIO_DIV_SDIVIDEND_OFFSET + 4) || (SIO_DIV_QUOTIENT_OFFSET != SIO_DIV_SDIVISOR_OFFSET + 4) || (SIO_DIV_REMAINDER_OFFSET != SIO_DIV_QUOTIENT_OFFSET + 4) |
| 49 | #error register layout has changed - we rely on this order to make sure we save/restore in the right order |
| 50 | #endif |
| 51 | |
| 52 | #if !PICO_DIVIDER_DISABLE_INTERRUPTS |
@ Save the current hardware-divider state (dividend, divisor, remainder,
@ quotient -> r4, r5, r7, r6) along with r4-r7/lr on the stack, so a dirty
@ divider can be restored after a nested 64-bit divide. Spins until the
@ divider is READY first, because the signed-ness of the in-flight operation
@ cannot be captured.
.macro save_div_state_and_lr_64
    push {r4, r5, r6, r7, lr}
    ldr r6, =SIO_BASE
1:
    ldr r5, [r6, #SIO_DIV_CSR_OFFSET]
    // wait for results as we can't save signed-ness of operation
    lsrs r5, #SIO_DIV_CSR_READY_SHIFT_FOR_CARRY
    bcc 1b
    // note we must read quotient last, and since it isn't the last reg, we'll not use ldmia!
    ldr r4, [r6, #SIO_DIV_UDIVIDEND_OFFSET]
    ldr r5, [r6, #SIO_DIV_UDIVISOR_OFFSET]
    ldr r7, [r6, #SIO_DIV_REMAINDER_OFFSET]
    ldr r6, [r6, #SIO_DIV_QUOTIENT_OFFSET]
.endm
| 67 | |
@ Restore divider state previously captured by save_div_state_and_lr_64
@ (r4=dividend, r5=divisor, r7=remainder, r6=quotient), then pop and return.
.macro restore_div_state_and_return_64
    // writing sdividend (r4), sdivisor (r5), quotient (r6), remainder (r7) in that order
    //
    // it is worth considering what happens if we are interrupted
    //
    // after writing r4: we are DIRTY and !READY
    //     ... interruptor using div will complete based on incorrect inputs, but dividend at least will be
    //         saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5: we are DIRTY and !READY
    //     ... interruptor using div will complete based on possibly wrongly signed inputs, but dividend, divisor
    //         at least will be saved/restored correctly and we'll restore the rest ourselves
    // after writing r4, r5, r6: we are DIRTY and READY
    //     ... interruptor using div will use dividend, divisor, quotient registers as is (what we just restored ourselves),
    //         and we'll restore the remainder after the fact

    mov ip, r2
    ldr r2, =SIO_BASE
    // note we do not use STM, not because it can be restarted due to interrupt (which is harmless), but because this is 1-cycle IO space
    // and so 4 separate stores are cheaper (and we don't have to adjust r2)
    str r4, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r5, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    str r7, [r2, #SIO_DIV_REMAINDER_OFFSET]
    str r6, [r2, #SIO_DIV_QUOTIENT_OFFSET]
    mov r2, ip
    pop {r4, r5, r6, r7, pc}
.endm
| 94 | |
| 95 | #endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */ |
| 96 | |
// since idiv and idivmod only differ by a cycle, we'll make them the same!
// Signed 32/32 divide via the SIO hardware divider.
// In:  r0 = dividend, r1 = divisor
// Out: r0 = quotient, r1 = remainder (AEABI idivmod returns both)
div_section WRAPPER_FUNC_NAME(__aeabi_idiv)
.align 2
wrapper_func __aeabi_idiv
wrapper_func __aeabi_idivmod
regular_func div_s32s32
regular_func divmod_s32s32
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_s32s32_savestate
regular_func divmod_s32s32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
// are the hardware_divider functions that can be used instead anyway
regular_func divmod_s32s32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_SDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_SDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f                      @ divide-by-zero: handled below
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    // build INT_MIN (dividend < 0) or INT_MAX (dividend > 0), 0 if dividend == 0,
    // as the quotient passed to/returned from __aeabi_idiv0
    movs r1, #0x80
    lsls r1, #24
    asrs r2, r0, #31
    eors r1, r2
    cmp r0, #0
    beq 1f
    mvns r0, r1
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds the SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
regular_func divmod_s32s32_savestate
    // note that we must be at least 2 cycles into division at this point,
    // which we are because of the dirty check before getting here (and of course the function call before that)
    save_div_state_and_lr
    bl divmod_s32s32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
| 161 | |
// since uidiv and uidivmod only differ by a cycle, we'll make them the same!
// Unsigned 32/32 divide via the SIO hardware divider.
// In:  r0 = dividend, r1 = divisor
// Out: r0 = quotient, r1 = remainder (AEABI uidivmod returns both)
div_section WRAPPER_FUNC_NAME(__aeabi_uidiv)
regular_func div_u32u32
regular_func divmod_u32u32
wrapper_func __aeabi_uidiv
wrapper_func __aeabi_uidivmod
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    ldr r2, =SIO_BASE
    ldr r3, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r3, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    bcs divmod_u32u32_savestate
regular_func divmod_u32u32_unsafe
#else
// to avoid too much source code spaghetti with restoring interrupts, we make this the same as the other funcs
// in the PICO_DIVIDER_DISABLE_INTERRUPTS case; i.e. it is not a faster function; this seems reasonable as there
// are the hardware_divider functions that can be used instead anyway
regular_func divmod_u32u32_unsafe
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    ldr r2, =SIO_BASE
    mrs r3, PRIMASK
    cpsid i
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
    str r0, [r2, #SIO_DIV_UDIVIDEND_OFFSET]
    str r1, [r2, #SIO_DIV_UDIVISOR_OFFSET]
    cmp r1, #0
    beq 1f                      @ divide-by-zero: handled below
    wait_div 2
    // return 64 bit value so we can efficiently return both (note read order is important since QUOTIENT must be read last)
    ldr r1, [r2, #SIO_DIV_REMAINDER_OFFSET]
    ldr r0, [r2, #SIO_DIV_QUOTIENT_OFFSET]
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    bx lr
1:
#if PICO_DIVIDER_DISABLE_INTERRUPTS
    msr PRIMASK, r3
#endif /* PICO_DIVIDER_DISABLE_INTERRUPTS */
    push {r2, lr}
    // quotient passed to/returned from __aeabi_idiv0: 0xffffffff if dividend != 0, else 0
    cmp r0, #0
    beq 1f
    movs r0, #0
    mvns r0, r0
1:
#if PICO_DIVIDER_CALL_IDIV0
    bl __aeabi_idiv0
#endif
    movs r1, #0 // remainder 0
    // need to restore saved r2 as it holds the SIO ptr
    pop {r2, pc}
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
.align 2
regular_func divmod_u32u32_savestate
    save_div_state_and_lr
    bl divmod_u32u32_unsafe
    restore_div_state_and_return
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
| 220 | |
div_section WRAPPER_FUNC_NAME(__aeabi_ldiv)

// Signed 64/64 divide wrapper: guards divmod_s64s64_unsafe against a dirty
// hardware divider (or disables interrupts, depending on configuration).
// In:  r0:r1 = dividend, r2:r3 = divisor
// Out: r0:r1 = quotient, r2:r3 = remainder
.align 2
wrapper_func __aeabi_ldivmod
regular_func div_s64s64
regular_func divmod_s64s64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2                  @ r2 is an argument, so stash it in ip around the dirty check
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_s64s64_savestate
    b divmod_s64s64_unsafe
.align 2
divmod_s64s64_savestate:
    save_div_state_and_lr_64
    bl divmod_s64s64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_s64s64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
| 250 | |
// Unsigned 64/64 divide wrapper: guards divmod_u64u64_unsafe against a dirty
// hardware divider (or disables interrupts, depending on configuration).
// In:  r0:r1 = dividend, r2:r3 = divisor
// Out: r0:r1 = quotient, r2:r3 = remainder
.align 2
wrapper_func __aeabi_uldivmod
regular_func div_u64u64
regular_func divmod_u64u64
#if !PICO_DIVIDER_DISABLE_INTERRUPTS
    // to support IRQ usage (or context switch) we must save/restore divider state around call if state is dirty
    mov ip, r2                  @ r2 is an argument, so stash it in ip around the dirty check
    ldr r2, =SIO_BASE
    ldr r2, [r2, #SIO_DIV_CSR_OFFSET]
    lsrs r2, #SIO_DIV_CSR_DIRTY_SHIFT_FOR_CARRY
    mov r2, ip
    bcs divmod_u64u64_savestate
    b divmod_u64u64_unsafe
.align 2
regular_func divmod_u64u64_savestate
    save_div_state_and_lr_64
    bl divmod_u64u64_unsafe
    restore_div_state_and_return_64
#else
    // to avoid worrying about IRQs (or context switches), simply disable interrupts around call
    push {r4, lr}
    mrs r4, PRIMASK
    cpsid i
    bl divmod_u64u64_unsafe
    msr PRIMASK, r4
    pop {r4, pc}
#endif /* !PICO_DIVIDER_DISABLE_INTERRUPTS */
| 278 | |
@ 64-bit negate: \hi:\lo = -(\hi:\lo)
@ one's-complement the high word, two's-complement the low word, then
@ propagate the carry into the high word only when the low word was zero
.macro dneg lo,hi
    mvns \hi,\hi
    negs \lo,\lo
    bne l\@_1
    adds \hi,#1
l\@_1:
.endm
| 286 | |
@ Signed 64/64 divide, no divider-state protection.
@ In:  r0:r1 = y (dividend), r2:r3 = x (divisor)
@ Out: r0:r1 = quotient, r2:r3 = remainder
@ Reduces to divmod_u64u64_unsafe by negating operands/results as needed;
@ remainder takes the sign of the dividend.
.align 2
regular_func divmod_s64s64_unsafe
    cmp r3,#0
    blt 1f
@ here x +ve
    beq 2f @ could x be zero?
3:
    cmp r1,#0
    bge divmod_u64u64_unsafe @ both positive
@ y -ve, x +ve
    push {r14}
    dneg r0,r1
    bl divmod_u64u64_unsafe
    dneg r0,r1 @ negate quotient
    dneg r2,r3 @ negate remainder
    pop {r15}

2:
    cmp r2,#0
    bne 3b @ back if x not zero

@ divide by zero: choose value passed to __aeabi_ldiv0 based on sign of y
    cmp r0,#0 @ y==0?
    bne 4f
    cmp r1,#0
    beq 5f @ then pass 0 to __aeabi_ldiv0
4:
    movs r0,#0
    lsrs r1,#31
    lsls r1,#31 @ get sign bit
    bne 5f @ y -ve? pass -2^63 to __aeabi_ldiv0
    mvns r0,r0
    lsrs r1,r0,#1 @ y +ve: pass 2^63-1 to __aeabi_ldiv0
5:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0 @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
@ here x -ve
    push {r14}
    cmp r1,#0
    blt 1f
@ y +ve, x -ve
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r0,r1 @ negate quotient; remainder keeps sign of y (+ve)
    pop {r15}

1:
@ y -ve, x -ve
    dneg r0,r1
    dneg r2,r3
    bl divmod_u64u64_unsafe
    dneg r2,r3 @ negate remainder to match sign of y; quotient stays +ve
    pop {r15}
| 346 | |
@ Unsigned 64/64 divide, no divider-state protection.
@ In:  r0:r1 = y (dividend), r2:r3 = x (divisor)
@ Out: r0:r1 = quotient, r2:r3 = remainder
@ Fast path here: both operands fit in 32 bits -> one hardware divide.
regular_func divmod_u64u64_unsafe
    cmp r1,#0
    bne y64 @ y fits in 32 bits?
    cmp r3,#0 @ yes; and x?
    bne 1f
    cmp r2,#0
    beq 2f @ x==0?
    mov r12,r7
    ldr r7,=SIO_BASE
    str r0,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    movs r1,#0 @ high words of quotient/remainder are zero
    movs r3,#0
    wait_div 2
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET]
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET]
    mov r7,r12
    bx r14

2: @ divide by 0 with y<2^32
    cmp r0,#0 @ y==0?
    beq 3f @ then pass 0 to __aeabi_ldiv0
udiv0:
    ldr r0,=0xffffffff
    movs r1,r0 @ pass 2^64-1 to __aeabi_ldiv0
3:
    push {r14}
#if PICO_DIVIDER_CALL_LDIV0
    bl __aeabi_ldiv0
#endif
    movs r2,#0 @ and return 0 for the remainder
    movs r3,#0
    pop {r15}

1:
    movs r2,r0 @ x>y, so result is 0 remainder y
    movs r3,r1
    movs r0,#0
    movs r1,#0
    bx r14

.ltorg
@ here y occupies more than 32 bits
@ split into cases according to the size of x
y64:
    cmp r3,#0
    beq 1f
    b y64_x48 @ if x does not fit in 32 bits, go to 48- and 64-bit cases
1:
    lsrs r3,r2,#16
    bne y64_x32 @ jump if x is 17..32 bits

@ here x is at most 16 bits: do three chained 32/16 hardware divides,
@ feeding each remainder into the next 16-bit slice of y

    cmp r2,#0
    beq udiv0 @ x==0? exit as with y!=0 case above
    push {r7}
    ldr r7,=SIO_BASE
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    wait_div 4
    push {r4, r5}
    lsrs r4,r0,#16
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r0=y0-q0*x; 0<=r0<x
    ldr r1,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q0=y0/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y1=(r0<<16)+(((ui32)y)>>16);
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET] @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
    wait_div 1
    uxth r4,r0
    ldr r3,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r1=y1-q1*x; 0<=r1<x
    ldr r5,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q1=y1/x;
    lsls r3,#16
    orrs r3,r4
    str r3,[r7,#SIO_DIV_UDIVIDEND_OFFSET] @ y2=(r1<<16)+(ui16)y;
    str r2,[r7,#SIO_DIV_UDIVISOR_OFFSET] @ must set divisor again, as we do not save/restore regs at all in IRQs if not dirty
    wait_div 3
    movs r3,#0
    lsls r4,r5,#16 @ quotient=(q0<<32)+(q1<<16)+q2
    lsrs r5,#16
    ldr r2,[r7,#SIO_DIV_REMAINDER_OFFSET] @ r2=y2-q2*x; 0<=r2<x
    ldr r0,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q2=y2/x;
    adds r0,r4
    adcs r1,r5
    pop {r4,r5,r7}
    bx r14

.ltorg
| 437 | |
y64_x32:
@ here x is 17..32 bits: normalise x, compute a 16-bit reciprocal estimate
@ with the hardware divider, then refine the quotient by repeated
@ multiply-and-subtract steps
    push {r4-r7,r14}
    mov r12,r2 @ save x
    movs r5,#0 @ xsh=0
    lsrs r4,r2,#24
    bne 1f
    lsls r2,#8 @ if(x0<1U<<24) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r2,#28
    bne 1f
    lsls r2,#4 @ if(x0<1U<<28) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r2,#30
    bne 1f
    lsls r2,#2 @ if(x0<1U<<30) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r2,#31
    bne 1f
    lsls r2,#1 @ if(x0<1U<<31) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^31<=x0<2^32, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+33 33<=qb<49
    lsrs r4,r2,#15
    adds r4,#1 @ x1=(x0>>15)+1; 2^16<x1<=2^17

    ldr r7,=SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    uxth r3,r2 @ x0l
    wait_div 2
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh
@ r12   x

    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7 @ quh=q0<<13

    muls r3,r6 @ x0l*q
    lsrs r7,r3,#15
    lsls r3,#17 @ r3:r7 is (x0l*q)<<17
    subs r0,r3
    sbcs r1,r7 @ y-=(x0l*q)<<17

    lsrs r3,r2,#16 @ x0h
    muls r3,r6 @ q*x0h
    adds r3,r3
    subs r1,r3 @ y-=(x0h*q)<<17

    lsrs r6,r1,#3
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>35)*r)>>16;
    add r14,r6 @ quh+=q1

    uxth r3,r2 @ x0l
    muls r3,r6 @ x0l*q
    lsrs r7,r3,#28
    lsls r3,#4 @ r3:r7 is (x0l*q)<<4
    subs r0,r3
    sbcs r1,r7 @ y-=(x0l*q)<<4

    lsrs r3,r2,#16 @ x0h
    muls r3,r6 @ x0h*q
    lsrs r7,r3,#12
    lsls r3,#20 @ r3:r7 is (x0h*q)<<4
    subs r0,r3
    sbcs r1,r7 @ y-=(x0h*q)<<4

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7 @ y>>22
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>22)*r)>>16;

    cmp r5,#9
    blt last0 @ if(xsh<9) goto last0;

@ on this path xsh>=9, which means x<2^23
    lsrs r2,#9 @ x0>>9: this shift loses no bits
@ the remainder y-x0*q is guaranteed less than a very small multiple of the remaining quotient
@ bits (at most 6 bits) times x, and so fits in one word
    muls r2,r6 @ x0*q
    subs r0,r2 @ y-x0*q
    lsls r7,r6,#13 @ qul=q<<13
1:
    lsrs r6,r0,#9
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>9)*r)>>16;

@ here
@ r0    y
@ r2    x0>>9
@ r5    xsh
@ r6    q
@ r7    qul
@ r12   x
@ r14   quh

    movs r3,#22
    subs r3,r5 @ 22-xsh
    lsrs r6,r3 @ q>>=22-xsh
    lsrs r7,r3 @ qul>>=22-xsh
    adds r7,r6 @ qul+=q
    mov r4,r12
    muls r6,r4 @ x*q
    subs r2,r0,r6 @ y-=x*q
    mov r0,r14 @ quh
    adds r5,#4 @ xsh+4
    adds r3,#6 @ 28-xsh
    movs r1,r0
    lsrs r1,r3
    lsls r0,r5 @ r0:r1 is quh<<(4+xsh)
    adds r0,r7
    bcc 1f
2:
    adds r1,#1
1: @ qu=((ui64)quh<<(4+xsh))+qul
    cmp r2,r4
    bhs 3f @ final correction loop if remainder still >= x
    movs r3,#0
    pop {r4-r7,r15}

.ltorg

3:
    subs r2,r4
    adds r0,#1
    bcc 1b
    b 2b @ while(y>=x) y-=x,qu++;
| 579 | |
@ final step of the 17..32-bit-divisor path when xsh<9
@ here:
@ r0:r1 y
@ r2    x0
@ r4    r
@ r5    xsh; xsh<9
@ r6    q

last0:
    movs r7,#9
    subs r7,r5 @ 9-xsh
    lsrs r6,r7 @ q>>=9-xsh
    mov r4,r12 @ x
    uxth r2,r4
    muls r2,r6 @ q*xlo
    subs r0,r2
    bcs 1f
    subs r1,#1 @ y-=q*xlo
1:
    lsrs r2,r4,#16 @ xhi
    muls r2,r6 @ q*xhi
    lsrs r3,r2,#16
    lsls r2,#16
    subs r2,r0,r2
    sbcs r1,r3 @ y-q*xhi
    movs r3,r1 @ y now in r2:r3
    mov r0,r14 @ quh
    adds r5,#4 @ xsh+4
    adds r7,#19 @ 28-xsh
    movs r1,r0
    lsrs r1,r7
    lsls r0,r5 @ r0:r1 is quh<<(4+xsh)
    adds r0,r6
    bcc 1f
    adds r1,#1 @ quh<<(xsh+4))+q
1:
    cmp r3,#0 @ y>=2^32?
    bne 3f
    cmp r2,r4 @ y>=x?
    bhs 4f
    pop {r4-r7,r15}

@ correction loop for y>=2^32: subtract x until high word clears
3:
    adds r0,#1 @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4 @ y-=x
    bcs 3b
    subs r3,#1
    bne 3b

1:
    cmp r2,r4
    bhs 4f
    pop {r4-r7,r15}

@ correction loop for y<2^32 but y>=x
4:
    adds r0,#1 @ qu++
    bcc 2f
    adds r1,#1
2:
    subs r2,r4 @ y-=x
    b 1b
| 643 | |
y64_x48:
@ here x is 33..64 bits: normalise x into 2^47..2^48, build a reciprocal
@ estimate with the hardware divider, then refine as in the 32-bit case
    push {r4-r7,r14}
    lsrs r4,r3,#16
    beq 1f
    b y64_x64 @ jump if x is 49..64 bits
1:
    push {r2-r3} @ save a copy of x
@ here x is 33..48 bits
    movs r5,#0 @ xsh=0
    lsrs r4,r3,#8
    bne 1f
    lsls r3,#8
    lsrs r6,r2,#24
    orrs r3,r6
    lsls r2,#8 @ if(x0<1U<<40) x0<<=8,xsh =8;
    adds r5,#8
1:
    lsrs r4,r3,#12
    bne 1f
    lsls r3,#4
    lsrs r6,r2,#28
    orrs r3,r6
    lsls r2,#4 @ if(x0<1U<<44) x0<<=4,xsh+=4;
    adds r5,#4
1:
    lsrs r4,r3,#14
    bne 1f
    lsls r3,#2
    lsrs r6,r2,#30
    orrs r3,r6
    lsls r2,#2 @ if(x0<1U<<46) x0<<=2,xsh+=2;
    adds r5,#2
1:
    lsrs r4,r3,#15
    bne 1f
    adds r2,r2
    adcs r3,r3 @ if(x0<1U<<47) x0<<=1,xsh+=1;
    adds r5,#1
1:
@ now 2^47<=x0<2^48, 0<=xsh<16 (amount x is shifted in x0); number of quotient bits to be calculated qb=xsh+17 17<=qb<33
    movs r4,r3
    adds r7,r2,r2
    adcs r4,r4
    adds r4,#1 @ x1=(ui32)(x0>>31)+1; // 2^16<x1<=2^17

    ldr r7,=SIO_BASE
    str r4,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    ldr r4,=0xffffffff
    str r4,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    lsrs r6,r1,#16
    wait_div 1
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ r=0xffffffffU/x1; 2^15<=r<2^16 r is a normalised reciprocal of x, guaranteed not an overestimate

@ here
@ r0:r1 y
@ r2:r3 x0
@ r4    r
@ r5    xsh 0<=xsh<16

    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>48)*r)>>16;
    lsls r7,r6,#13
    mov r14,r7 @ save q<<13
    uxth r7,r2 @ x0l
    muls r7,r6
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    subs r0,r7 @ subtract twice: (x0l*q)<<1
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3 @ x0h
    muls r7,r6
    subs r1,r7
    subs r1,r7
    lsrs r7,r2,#16 @ x0m
    muls r7,r6
    lsls r6,r7,#17
    lsrs r7,#15
    subs r0,r6
    sbcs r1,r7 @ y-=((ui64)q*x0)<<1;

    lsrs r6,r1,#3 @ y>>35
    muls r6,r4
    lsrs r6,#16 @ q=((ui32)(y>>35)*r)>>16;

    cmp r5,#12
    blt last1 @ if(xsh<12) goto last1;

    add r14,r6 @ qu<<13+q
    lsrs r2,#12
    lsls r7,r3,#20
    orrs r2,r7
    lsrs r3,#12 @ x0>>12

    uxth r7,r2 @ x0l
    muls r7,r6
    subs r0,r7
    bcs 1f
    subs r1,#1
1:
    uxth r7,r3 @ x0h
    muls r7,r6
    subs r1,r7
    lsrs r7,r2,#16 @ x0m
    muls r7,r6
    lsls r6,r7,#16
    lsrs r7,#16
    subs r0,r6
    sbcs r1,r7 @ y-=((ui64)q*x0)>>12

    lsrs r6,r0,#22
    lsls r7,r1,#10
    orrs r6,r7 @ y>>22
    muls r6,r4
    movs r7,#41
    subs r7,r5
    lsrs r6,r7 @ q=((ui32)(y>>22)*r)>>(16+25-xsh)

    subs r5,#12
    mov r7,r14
    lsls r7,r5
2:
    adds r7,r6 @ qu=(qu<<(xsh-12))+q
    pop {r4,r5} @ recall x

@ here
@ r0:r1 y
@ r4:r5 x
@ r6    q
@ r7    qu

    uxth r2,r4
    uxth r3,r5
    muls r2,r6 @ xlo*q
    muls r3,r6 @ xhi*q
    subs r0,r2
    sbcs r1,r3
    lsrs r2,r4,#16
    muls r2,r6
    lsrs r3,r2,#16
    lsls r2,#16 @ xm*q
    subs r0,r2
    sbcs r1,r3 @ y-=(ui64)q*x

1:
    movs r2,r0
    movs r3,r1
    adds r7,#1
    subs r0,r4
    sbcs r1,r5 @ while(y>=x) y-=x,qu++;
    bhs 1b
    subs r0,r7,#1 @ correction to qu
    movs r1,#0
    pop {r4-r7,r15}

last1:
@ final step of the 33..48-bit-divisor path when xsh<12
@ r0:r1 y
@ r2:r3 x0
@ r5    xsh
@ r6    q

    movs r7,#12
    subs r7,r5
    lsrs r6,r7 @ q>>=12-xsh
    mov r7,r14
    lsrs r7,#13
    lsls r7,r5
    adds r7,r7 @ qu<<(xsh+1)
    b 2b
| 817 | |
y64_x64:
@ here x is 49..64 bits: one hardware divide of the high words gives an
@ estimate q (never an overestimate because we divide by (x>>32)+1);
@ the quotient is then at most a few counts low and is fixed up by the
@ subtraction loop below
    movs r4,#0 @ q=0 if x>>32==0xffffffff
    adds r5,r3,#1
    beq 1f

    ldr r7,=SIO_BASE
    str r5,[r7,#SIO_DIV_UDIVISOR_OFFSET]
    str r1,[r7,#SIO_DIV_UDIVIDEND_OFFSET]
    wait_div 0
    ldr r4,[r7,#SIO_DIV_QUOTIENT_OFFSET] @ q=(ui32)(y>>32)/((x>>32)+1)
1:
    uxth r5,r2
    uxth r6,r3
    muls r5,r4
    muls r6,r4
    subs r0,r5
    sbcs r1,r6
    lsrs r5,r2,#16
    lsrs r6,r3,#16
    muls r5,r4
    muls r6,r4
    lsls r6,#16
    lsrs r7,r5,#16
    orrs r6,r7
    lsls r5,#16
    subs r0,r5
    sbcs r1,r6 @ y-=(ui64)q*x

    cmp r1,r3 @ while(y>=x) y-=x,q++
    bhs 1f
3:
    movs r2,r0 @ done: remainder=y, quotient=q (fits in 32 bits here)
    movs r3,r1
    movs r0,r4
    movs r1,#0
    pop {r4-r7,r15}

1:
    bne 2f
    cmp r0,r2 @ high words equal: compare low words
    blo 3b
2:
    subs r0,r2
    sbcs r1,r3
    adds r4,#1
    cmp r1,r3
    blo 3b
    b 1b
| 867 | |
div_section divmod_s64s64_rem
@ Signed 64/64 divide that also stores the remainder through a pointer.
@ In:  r0:r1 = dividend, r2:r3 = divisor, [sp] on entry = int64_t *remainder
@ Out: r0:r1 = quotient; *remainder written from r2:r3
regular_func divmod_s64s64_rem
    push {r4, lr}
    bl divmod_s64s64
    ldr r4, [sp, #8] @ stacked 5th argument (remainder pointer), above our push
    stmia r4!, {r2,r3}
    pop {r4, pc}
| 875 | |
div_section divmod_u64u64_rem
@ Unsigned 64/64 divide that also stores the remainder through a pointer.
@ In:  r0:r1 = dividend, r2:r3 = divisor, [sp] on entry = uint64_t *remainder
@ Out: r0:r1 = quotient; *remainder written from r2:r3
regular_func divmod_u64u64_rem
    push {r4, lr}
    bl divmod_u64u64
    ldr r4, [sp, #8] @ stacked 5th argument (remainder pointer), above our push
    stmia r4!, {r2,r3}
    pop {r4, pc}