📄 lib1funcs.asm
字号:
mulu.l r7,r3,r5 add r8,r7,r8 sub r2,r3,r2 cmpgt r2,r5,r5 add r8,r5,r2 /* could test r3 here to check for divide by zero. */ blink tr0,r63LOCAL(large_divisor): mmulfx.w r5,r4,r4 shlrd r2,r9,r25 shlri r25,32,r8 msub.w r1,r4,r1 mulu.l r1,r7,r4 addi r1,-3,r5 mulu.l r5,r8,r5 sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as the case may be, %0000000000000000 000.11111111111, still */ muls.l r1,r4,r4 /* leaving at least one sign bit. */ shlri r5,14-1,r8 mulu.l r8,r7,r5 mshalds.l r1,r21,r1 shari r4,26,r4 add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) sub r25,r5,r25 /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ shlri r25,22,r21 mulu.l r21,r1,r21 pta LOCAL(no_lo_adj),tr0 addi r22,32,r0 shlri r21,40,r21 mulu.l r21,r7,r5 add r8,r21,r8 shlld r2,r0,r2 sub r25,r5,r25 bgtu/u r7,r25,tr0 // no_lo_adj addi r8,1,r8 sub r25,r7,r25LOCAL(no_lo_adj): mextr4 r2,r25,r2 /* large_divisor: only needs a few adjustments. */ mulu.l r8,r6,r5 ptabs r18,tr0 /* bubble */ cmpgtu r5,r2,r5 sub r8,r5,r2 blink tr0,r63 ENDFUNC(GLOBAL(udivdi3))/* Note 1: To shift the result of the second divide stage so that the result always fits into 32 bits, yet we still reduce the rest sufficiently would require a lot of instructions to do the shifts just right. Using the full 64 bit shift result to multiply with the divisor would require four extra instructions for the upper 32 bits (shift / mulu / shift / sub). Fortunately, if the upper 32 bits of the shift result are nonzero, we know that the rest after taking this partial result into account will fit into 32 bits. So we just clear the upper 32 bits of the rest if the upper 32 bits of the partial result are nonzero. 
*/
#endif /* __SHMEDIA__ */
#endif /* L_udivdi3 */

#ifdef L_divdi3
#ifdef __SHMEDIA__
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(divdi3)
	FUNC(GLOBAL(divdi3))
/* GLOBAL(divdi3): signed 64-bit division, built on GLOBAL(udivdi3).
   In:  r2 = dividend, r3 = divisor.  Out: r2 = quotient.
   Strategy: take absolute values, divide unsigned, then negate the
   result iff dividend and divisor had different signs.  */
GLOBAL(divdi3):
	pta	GLOBAL(udivdi3),tr0	/* target for both tail-jump and call */
	shari	r2,63,r22		/* r22 = sign mask of dividend (0 or -1) */
	shari	r3,63,r23		/* r23 = sign mask of divisor */
	xor	r2,r22,r2		/* xor+sub pair = conditional negate: */
	xor	r3,r23,r3		/*   abs(x) = (x ^ mask) - mask */
	sub	r2,r22,r2
	sub	r3,r23,r3
	beq/u	r22,r23,tr0		/* same signs: tail-jump into udivdi3 */
	ptabs	r18,tr1			/* save our own return address in tr1 */
	blink	tr0,r18			/* call udivdi3 (it returns via r18) */
	sub	r63,r2,r2		/* negate quotient (r63 reads as zero) */
	blink	tr1,r63			/* return to the original caller */
	ENDFUNC(GLOBAL(divdi3))
#endif /* __SHMEDIA__ */
#endif /* L_divdi3 */

#ifdef L_umoddi3
#ifdef __SHMEDIA__
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(umoddi3)
	FUNC(GLOBAL(umoddi3))
/* GLOBAL(umoddi3): unsigned 64-bit modulo.
   In:  r2 = dividend, r3 = divisor.  Out: r2 = remainder.
   Normalizes the divisor (shift count in r22), computes a 31-bit
   fixed-point reciprocal estimate with the mmulfx.w refinement steps
   below, then reduces the dividend in 32-bit stages.  Divisors that
   need (nearly) the full shift range take the large_divisor path.  */
GLOBAL(umoddi3):
	shlri	r3,1,r4
	nsb	r4,r22			/* r22 = normalization shift count */
	shlld	r3,r22,r6		/* r6 = divisor, normalized */
	shlri	r6,49,r5
	movi	0xffffffffffffbaf1,r21 /* .l shift count 17. */
	sub	r21,r5,r1		/* initial reciprocal estimate */
	mmulfx.w	r1,r1,r4
	mshflo.w	r1,r63,r1
	sub	r63,r22,r20 // r63 == 64 % 64
	mmulfx.w	r5,r4,r4
	pta	LOCAL(large_divisor),tr0
	addi	r20,32,r9
	msub.w	r1,r4,r1		/* fixed-point refinement of r1 */
	madd.w	r1,r1,r1
	mmulfx.w	r1,r1,r4
	shlri	r6,32,r7
	bgt/u	r9,r63,tr0 // large_divisor
	mmulfx.w	r5,r4,r4
	shlri	r2,32+14,r19
	addi	r22,-31,r0
	msub.w	r1,r4,r1
	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r19,r5
	sub	r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as the case may be, %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4 /* leaving at least one sign bit. */
	mulu.l	r5,r3,r5
	mshalds.l	r1,r21,r1
	shari	r4,26,r4
	shlld	r5,r0,r5
	add	r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r2,r5,r2		/* first reduction of the dividend */
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
	shlri	r2,22,r21
	mulu.l	r21,r1,r21
	addi	r20,30-22,r0
	/* bubble */
	/* could test r3 here to check for divide by zero.
*/
	shlrd	r21,r0,r21		/* partial quotient digit */
	mulu.l	r21,r3,r5
	mcmpgt.l	r21,r63,r21 // See Note 1
	addi	r20,30,r0
	mshfhi.l	r63,r21,r21
	sub	r2,r5,r2		/* subtract partial product from rest */
	andc	r2,r21,r2		/* clear upper 32 bits of rest (Note 1) */
	/* small divisor: need a third divide step */
	mulu.l	r2,r1,r7
	ptabs	r18,tr0			/* prepare the return branch */
	sub	r2,r3,r8 /* re-use r8 here for rest - r3 */
	shlrd	r7,r0,r7
	mulu.l	r7,r3,r5
	/* bubble */
	addi	r8,1,r7
	cmpgt	r7,r5,r7		/* final correction: rest or rest - r3 */
	cmvne	r7,r8,r2
	sub	r2,r5,r2
	blink	tr0,r63			/* return; remainder in r2 */
/* Divisor needed (nearly) the full shift range: finish the reciprocal
   refinement and do both divide steps on the shifted-down rest in r25.  */
LOCAL(large_divisor):
	mmulfx.w	r5,r4,r4
	shlrd	r2,r9,r25		/* r25 = high part of the rest */
	shlri	r25,32,r8
	msub.w	r1,r4,r1
	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r8,r5
	sub	r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as the case may be, %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4 /* leaving at least one sign bit. */
	shlri	r5,14-1,r8
	mulu.l	r8,r7,r5
	mshalds.l	r1,r21,r1
	shari	r4,26,r4
	add	r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r25,r5,r25
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
	shlri	r25,22,r21
	mulu.l	r21,r1,r21
	pta	LOCAL(no_lo_adj),tr0
	addi	r22,32,r0
	shlri	r21,40,r21
	mulu.l	r21,r7,r5
	add	r8,r21,r8
	shlld	r2,r0,r2
	sub	r25,r5,r25
	bgtu/u	r7,r25,tr0 // no_lo_adj
	addi	r8,1,r8			/* low-word adjustment of the quotient */
	sub	r25,r7,r25
LOCAL(no_lo_adj):
	mextr4	r2,r25,r2
	/* large_divisor: only needs a few adjustments. */
	mulu.l	r8,r6,r5
	ptabs	r18,tr0
	add	r2,r6,r7
	cmpgtu	r5,r2,r8
	cmvne	r8,r7,r2		/* add divisor back if we overshot */
	sub	r2,r5,r2
	shlrd	r2,r22,r2		/* undo the normalization shift */
	blink	tr0,r63
	ENDFUNC(GLOBAL(umoddi3))
/* Note 1: To shift the result of the second divide stage so that the
   result always fits into 32 bits, yet we still reduce the rest
   sufficiently would require a lot of instructions to do the shifts just
   right.  Using the full 64 bit shift result to multiply with the divisor
   would require four extra instructions for the upper 32 bits
   (shift / mulu / shift / sub).  Fortunately, if the upper 32 bits of the
   shift result are nonzero, we know that the rest after taking this
   partial result into account will fit into 32 bits.
So we just clear the upper 32 bits of the rest if the upper 32 bits of
   the partial result are nonzero.  */
#endif /* __SHMEDIA__ */
#endif /* L_umoddi3 */

#ifdef L_moddi3
#ifdef __SHMEDIA__
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(moddi3)
	FUNC(GLOBAL(moddi3))
/* GLOBAL(moddi3): signed 64-bit modulo, built on GLOBAL(umoddi3).
   In:  r2 = dividend, r3 = divisor.  Out: r2 = remainder.
   Unlike divdi3, the sign of the result depends only on the sign of
   the dividend (r22), hence the comparison against r63 (zero) below.  */
GLOBAL(moddi3):
	pta	GLOBAL(umoddi3),tr0
	shari	r2,63,r22		/* r22 = sign mask of dividend (0 or -1) */
	shari	r3,63,r23		/* r23 = sign mask of divisor */
	xor	r2,r22,r2		/* conditional negation: */
	xor	r3,r23,r3		/*   abs(x) = (x ^ mask) - mask */
	sub	r2,r22,r2
	sub	r3,r23,r3
	beq/u	r22,r63,tr0		/* dividend >= 0: tail-jump to umoddi3 */
	ptabs	r18,tr1			/* save our return address in tr1 */
	blink	tr0,r18			/* call umoddi3 */
	sub	r63,r2,r2		/* negate remainder */
	blink	tr1,r63
	ENDFUNC(GLOBAL(moddi3))
#endif /* __SHMEDIA__ */
#endif /* L_moddi3 */

#ifdef L_set_fpscr
#if !defined (__SH2A_NOFPU__)
#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
#ifdef __SH5__
	.mode	SHcompact
#endif
	.global GLOBAL(set_fpscr)
	FUNC(GLOBAL(set_fpscr))
/* GLOBAL(set_fpscr): load the FPU status/control register from r4 and
   record two derived variant values in the two-entry table
   GLOBAL(fpscr_values); which slot receives which variant depends on
   the target configuration (see the #if blocks below).  */
GLOBAL(set_fpscr):
	lds	r4,fpscr		/* install the new FPSCR value */
#ifdef __PIC__
	/* Locate fpscr_values through the GOT.  */
	mov.l	r12,@-r15
	mova	LOCAL(set_fpscr_L0),r0
	mov.l	LOCAL(set_fpscr_L0),r12
	add	r0,r12
	mov.l	LOCAL(set_fpscr_L1),r0
	mov.l	@(r0,r12),r1
	mov.l	@r15+,r12
#else
	mov.l	LOCAL(set_fpscr_L1),r1	/* r1 = &fpscr_values */
#endif
	swap.w	r4,r0
	or	#24,r0
#ifndef FMOVD_WORKS
	xor	#16,r0
#endif
#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
	swap.w	r0,r3
	mov.l	r3,@(4,r1)		/* store first variant into slot 1 */
#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
	swap.w	r0,r2
	mov.l	r2,@r1			/* store first variant into slot 0 */
#endif
#ifndef FMOVD_WORKS
	xor	#8,r0
#else
	xor	#24,r0
#endif
#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
	swap.w	r0,r2
	rts
	mov.l	r2,@r1			/* delay slot: other variant, slot 0 */
#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
	swap.w	r0,r3
	rts
	mov.l	r3,@(4,r1)		/* delay slot: other variant, slot 1 */
#endif
	.align 2
#ifdef __PIC__
LOCAL(set_fpscr_L0):
	.long	_GLOBAL_OFFSET_TABLE_
LOCAL(set_fpscr_L1):
	.long	GLOBAL(fpscr_values@GOT)
#else
LOCAL(set_fpscr_L1):
	.long	GLOBAL(fpscr_values)
#endif
	ENDFUNC(GLOBAL(set_fpscr))
#ifndef NO_FPSCR_VALUES
#ifdef __ELF__
	.comm	GLOBAL(fpscr_values),8,4
#else
	.comm	GLOBAL(fpscr_values),8
#endif /* ELF */
#endif /* NO_FPSCR_VALUES */
#endif /* SH2E / SH3E / SH4 */
#endif /* __SH2A_NOFPU__ */
#endif /*
L_set_fpscr */
#ifdef L_ic_invalidate
#if __SH5__ == 32
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(init_trampoline)
	FUNC(GLOBAL(init_trampoline))
/* GLOBAL(init_trampoline): fill in a 16-byte trampoline at r0.
   Bytes 0..7 receive a fixed code sequence, built in r20 with the
   movi/shori chain ordered to match the target endianness; word 2
   (offset 8) gets r2 and word 3 (offset 12) gets r3.  Falls through
   into ic_invalidate to make the new code visible to the icache.  */
GLOBAL(init_trampoline):
	st.l	r0,8,r2			/* trampoline word 2 = r2 */
#ifdef __LITTLE_ENDIAN__
	movi	9,r20
	shori	0x402b,r20
	shori	0xd101,r20
	shori	0xd002,r20
#else
	movi	0xffffffffffffd002,r20
	shori	0xd101,r20
	shori	0x402b,r20
	shori	9,r20
#endif
	st.q	r0,0,r20		/* trampoline code quadword */
	st.l	r0,12,r3		/* trampoline word 3 = r3 */
	.global	GLOBAL(ic_invalidate)
	FUNC(GLOBAL(ic_invalidate))
/* GLOBAL(ic_invalidate) (SH5): write back the data-cache block at r0
   and invalidate the corresponding instruction-cache block.  */
GLOBAL(ic_invalidate):
	ocbwb	r0,0			/* write back the dcache block */
	synco
	icbi	r0, 0			/* invalidate the icache block */
	ptabs	r18, tr0
	synci
	blink	tr0, r63
	ENDFUNC(GLOBAL(ic_invalidate))
	ENDFUNC(GLOBAL(init_trampoline))
#elif defined(__SH4A__)
	.global	GLOBAL(ic_invalidate)
	FUNC(GLOBAL(ic_invalidate))
/* SH4A: the icbi instruction performs the invalidation directly.  */
GLOBAL(ic_invalidate):
	ocbwb	@r4			/* write back dcache block at r4 */
	synco
	rts
	icbi	@r4			/* delay slot: invalidate icache block */
	ENDFUNC(GLOBAL(ic_invalidate))
#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__)
	/* This assumes a direct-mapped cache, which is the case for the
	   first SH4, but not for the second version of SH4, that uses a
	   2-way set-associative cache, nor SH4a, that is 4-way.  SH4a
	   fortunately offers an instruction to invalidate the instruction
	   cache, and we use it above, but SH4 doesn't.  However, since the
	   libraries don't contain any nested functions (the only case in
	   which GCC would emit this pattern) and we actually emit the
	   ic_invalidate_line_i pattern for cache invalidation on all SH4
	   multilibs (even 4-nofpu, that isn't even covered here), and
	   pre-SH4 cores don't have caches, it seems like this code is
	   pointless, unless it's meant for backward binary compatibility
	   or for userland-only cache invalidation for say sh4-*-linux-gnu.
	   Such a feature should probably be moved into a system call, such
	   that the kernel could do whatever it takes to invalidate a cache
	   line on the core it's actually running on.  I.e., this hideous
	   :-) piece of code should go away at some point.
*/
	.global	GLOBAL(ic_invalidate)
	FUNC(GLOBAL(ic_invalidate))
/* SH4: there is no icache-invalidate instruction, so write back the
   dcache block at r4 and then branch into an 8 KB array of rts/nop
   cache lines (256 lines x 32 bytes, below) at the matching cache
   index, so that fetching it displaces the stale line from the
   direct-mapped instruction cache.  */
GLOBAL(ic_invalidate):
	ocbwb	@r4			/* write back the dcache block */
	mova	0f,r0
	mov.w	1f,r1			/* r1 = 0x1fe0, cache-index mask */
/* Compute how many cache lines 0f is away from r4.  */
	sub	r0,r4
	and	r1,r4
/* Prepare to branch to 0f plus the cache-line offset.  */
	add	# 0f - 1f,r4
	braf	r4
	nop
1:
	.short	0x1fe0
	.p2align 5
/* This must be aligned to the beginning of a cache line.  */
0:
	.rept	256 /* There are 256 cache lines of 32 bytes.  */
	rts
	.rept	15
	nop
	.endr
	.endr
	ENDFUNC(GLOBAL(ic_invalidate))
#endif /* SH4 */
#endif /* L_ic_invalidate */

#if defined (__SH5__) && __SH5__ == 32
#ifdef L_shcompact_call_trampoline
	.section	.rodata
	.align	1
/* Dispatch table used by the SHcompact call trampoline: one entry per
   argument-handling action, each the offset of its handler relative to
   LOCAL(ct_main_label).  */
LOCAL(ct_main_table):
.word	LOCAL(ct_r2_fp)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r2_ld)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r2_pop)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r3_fp)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r3_ld)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r3_pop)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r4_fp)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r4_ld)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r4_pop)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r5_fp)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r5_ld)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r5_pop)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_fph)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_fpl)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_ld)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_pop)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_fph)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_fpl)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_ld)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_pop)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_fph)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_fpl)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_ld)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_pop)	- datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r9_fph)	- datalabel
LOCAL(ct_main_label).word LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label).word LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label).word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label).word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label).word LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label).word LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label).word LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label).word LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label) .mode SHmedia .section .text..SHmedia32, "ax" .align 2 /* This function loads 64-bit general-purpose registers from the stack, from a memory address contained in them or from an FP register, according to a cookie passed in r1. Its execution time is linear on the number of registers that actually have to be copied. See sh.h for details on the actual bit pattern. The function to be called is passed in r0. If a 32-bit return value is expected, the actual function will be tail-called, otherwise the return address will be stored in r10 (that the caller should expect to be clobbered) and the return value will be expanded into r2/r3 upon return. */ .global GLOBAL(GCC_shcompact_call_trampoline) FUNC(GLOBAL(GCC_shcompact_call_trampoline))GLOBAL(GCC_shcompact_call_trampoline): ptabs/l r0, tr0 /* Prepare to call the actual function. */ movi ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0 pt/l LOCAL(ct_loop), tr1 addz.l r1, r63, r1 shori ((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0LOCAL(ct_loop): nsb r1, r28 shlli r28, 1, r29 ldx.w r0, r29, r30LOCAL(ct_main_label): ptrel/l r30, tr2 blink tr2, r63LOCAL(ct_r2_fp): /* Copy r2 from an FP register. */ /* It must be dr0, so just do it. */ fmov.dq dr0, r2 movi 7, r30 shlli r30, 29, r31 andc r1, r31, r1 blink tr1, r63LOCAL(ct_r3_fp): /* Copy r3 from an FP register. */ /* It is either dr0 or dr2. 
*/ movi 7, r30 shlri r1, 26, r32 shlli r30, 26, r31 andc r1, r31, r1 fmov.dq dr0, r3 beqi/l r32, 4, tr1 fmov.dq dr2, r3 blink tr1, r63LOCAL(ct_r4_fp): /* Copy r4 from an FP register. */ shlri r1, 23 - 3, r34 andi r34, 3 << 3, r33 addi r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32LOCAL(ct_r4_fp_base): ptrel/l r32, tr2 movi 7, r30 shlli r30, 23, r31 andc r1, r31, r1 blink tr2, r63LOCAL(ct_r4_fp_copy): fmov.dq dr0, r4 blink tr1, r63 fmov.dq dr2, r4 blink tr1, r63 fmov.dq dr4, r4 blink tr1, r63LOCAL(ct_r5_fp): /* Copy r5 from an FP register. */ shlri r1, 20 - 3, r34 andi r34, 3 << 3, r33 addi r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32LOCAL(ct_r5_fp_base): ptrel/l r32, tr2 movi 7, r30 shlli r30, 20, r31 andc r1, r31, r1 blink tr2, r63LOCAL(ct_r5_fp_copy): fmov.dq dr0, r5 blink tr1, r63 fmov.dq dr2, r5 blink tr1, r63 fmov.dq dr4, r5 blink tr1, r63 fmov.dq dr6, r5 blink tr1, r63LOCAL(ct_r6_fph): /* Copy r6 from a high FP register. */ /* It must be dr8. */ fmov.dq dr8, r6 movi 15, r30 shlli r30, 16, r31 andc r1, r31, r1 blink tr1, r63LOCAL(ct_r6_fpl): /* Copy r6 from a low FP register. */ shlri r1, 16 - 3, r34 andi r34, 3 << 3, r33 addi r33, LOCAL(ct_r6_fp_copy) - datalabel LOCAL(ct_r6_fp_base), r32LOCAL(ct_r6_fp_base): ptrel/l r32, tr2 movi 7, r30 shlli r30, 16, r31 andc r1, r31, r1 blink tr2, r63LOCAL(ct_r6_fp_copy): fmov.dq dr0, r6 blink tr1, r63 fmov.dq dr2, r6 blink tr1, r63 fmov.dq dr4, r6 blink tr1, r63 fmov.dq dr6, r6 blink tr1, r63LOCAL(ct_r7_fph): /* Copy r7 from a high FP register. */ /* It is either dr8 or dr10. */ movi 15 << 12, r31 shlri r1, 12, r32 andc r1, r31, r1 fmov.dq dr8, r7 beqi/l r32, 8, tr1 fmov.dq dr10, r7 blink tr1, r63LOCAL(ct_r7_fpl): /* Copy r7 from a low FP register. 
*/ shlri r1, 12 - 3, r34 andi r34, 3 << 3, r33 addi r33, LOCAL(ct_r7_fp_copy) - datalabel LOCAL(ct_r7_fp_base), r32LOCAL(ct_r7_fp_base): ptrel/l r32, tr2 movi 7 << 12, r31 andc r1, r31, r1 blink tr2, r63LOCAL(ct_r7_fp_copy): fmov.dq dr0, r7 blink tr1, r63 fmov.dq dr2, r7 blink tr1, r63 fmov.dq dr4, r7 blink tr1, r63 fmov.dq dr6, r7 blink tr1, r63LOCAL(ct_r8_fph): /* Copy r8 from a high FP register. */ /* It is either dr8 or dr10. */ movi 15 << 8, r31 andi r1, 1 << 8, r32
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -