📄 lib1funcs.asm

📁 linux下的gcc编译器
💻 ASM
📖 第 1 页 / 共 4 页
字号:
	/* Tail of a loop-based udivsi3 variant.  Its entry label and the
	   set-up of tr0/tr1/tr2 are not part of this excerpt; presumably
	   tr1 -> udivsi3_loop, tr2 -> udivsi3_dontadd and tr0 -> return
	   address (TODO confirm against the preceding lines).
	   Classic shift-and-subtract: r0 collects the quotient, r18 is the
	   current quotient bit, r19 the divisor aligned under that bit.  */
	movi	0, r0
	movi	1, r18
	addz.l	r5, r63, r19		/* r19 = r5 zero-extended from 32 bits */
	addz.l	r4, r63, r4		/* r4  = r4 zero-extended from 32 bits */
	shlli	r19, 31, r19
	shlli	r18, 31, r18
LOCAL(udivsi3_loop):
	bgtu	r19, r4, tr2		/* shifted divisor too big: skip this bit */
	or	r0, r18, r0
	sub	r4, r19, r4
LOCAL(udivsi3_dontadd):
	shlri	r18, 1, r18
	shlri	r19, 1, r19
	bnei	r18, 0, tr1		/* loop until the bit mask has shifted out */
	blink	tr0, r63
#else
/* udivsi3, straight-line SHmedia variant: no loop and no branch other than
   the final return.  NOTE(review): the mmulfx.w / msub.w sequence looks
   like a fixed-point reciprocal estimate of the divisor that is then
   refined by multiply / subtract correction steps - confirm against the
   SHmedia instruction manual before relying on this description.  */
GLOBAL(udivsi3):
	// inputs: r4,r5
	// clobbered: r18,r19,r20,r21,r22,r25,tr0
	// result in r0.
	addz.l	r5,r63,r22
	nsb	r22,r0
	shlld	r22,r0,r25
	shlri	r25,48,r25
	movi	0xffffffffffffbb0c,r20 // shift count eqiv 76
	sub	r20,r25,r21
	mmulfx.w	r21,r21,r19
	mshflo.w	r21,r63,r21
	ptabs	r18,tr0
	mmulfx.w	r25,r19,r19
	sub	r20,r0,r0
	/* bubble */
	msub.w	r21,r19,r19
	addi	r19,-2,r21 /* It would be nice for scheduling to do this add to r21
			    before the msub.w, but we need a different value for
			    r19 to keep errors under control.  */
	mulu.l	r4,r21,r18
	mmulfx.w	r19,r19,r19
	shlli	r21,15,r21
	shlrd	r18,r0,r18
	mulu.l	r18,r22,r20
	mmacnfx.wl	r25,r19,r21
	/* bubble */
	sub	r4,r20,r25

	mulu.l	r25,r21,r19
	addi	r0,14,r0
	/* bubble */
	shlrd	r19,r0,r19
	mulu.l	r19,r22,r20
	add	r18,r19,r18
	/* bubble */
	sub.l	r25,r20,r25

	mulu.l	r25,r21,r19
	addz.l	r25,r63,r25
	sub	r25,r22,r25
	shlrd	r19,r0,r19
	mulu.l	r19,r22,r20
	addi	r25,1,r25
	add	r18,r19,r18

	cmpgt	r25,r20,r25
	add.l	r18,r25,r0
	blink	tr0,r63
#endif
#elif defined (__SHMEDIA__)
/* m5compact-nofpu - more emphasis on code size than on speed, but don't
   ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4.
   So use a short shmedia loop.  */
/* Register use in the loop below:
     r20 - working value, starts as the zero-extended dividend (r4)
     r21 - (r5 << 32) - 1, the amount conditionally subtracted
     r25 - starts as r5 << 32; its low 32 bits double as the loop
	   counter: addi.l steps them -1, -2, ... and the loop exits
	   at -32, i.e. after 32 passes.
   The low 32 bits of r20 are returned in r0.  */
	// clobbered: r20,r21,r25,tr0,tr1,tr2
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
GLOBAL(udivsi3):
	pt/l	LOCAL(udivsi3_dontsub), tr0
	pt/l	LOCAL(udivsi3_loop), tr1
	ptabs/l	r18,tr2			/* tr2 = return address */
	shlli	r5,32,r25
	addi	r25,-1,r21
	addz.l	r4,r63,r20
LOCAL(udivsi3_loop):
	shlli	r20,1,r20
	bgeu/u	r21,r20,tr0		/* r21 >= r20: nothing to subtract */
	sub	r20,r21,r20
LOCAL(udivsi3_dontsub):
	addi.l	r25,-1,r25
	bnei	r25,-32,tr1
	add.l	r20,r63,r0
	blink	tr2,r63
#else /* ! defined (__SHMEDIA__) */
/* Helpers for the div1-based udivsi3 below.  Each is entered with bsr and
   returns with rts; the instruction after rts sits in the rts delay slot
   and is therefore part of the count.
     div8  - eight div1 steps (one, then falls through into div7)
     div7  - seven div1 steps
     divx4 - four div1 steps, the first three each followed by rotcl r0  */
LOCAL(div8):
	div1	r5,r4
LOCAL(div7):
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	div1	r5,r4
	rts
	div1	r5,r4			/* delay slot */

LOCAL(divx4):
	div1	r5,r4
	rotcl	r0
	div1	r5,r4
	rotcl	r0
	div1	r5,r4
	rotcl	r0
	rts
	div1	r5,r4			/* delay slot */

/* udivsi3, SHcompact div1 version.  r4 and r5 are the operands of every
   div1 (div1 r5,r4) and the result is assembled in r0; pr is saved on the
   stack because the helpers above are called with bsr.
   Two paths: when r5 equals its own zero-extended low 16 bits the code
   runs the div8/div7 sequence on 16-bit halves, otherwise it takes
   large_divisor and runs divx4 four times.  */
GLOBAL(udivsi3):
	sts.l	pr,@-r15
	extu.w	r5,r0
	cmp/eq	r5,r0			/* T = (r5 fits in 16 bits) */
#ifdef __sh1__
	bf	LOCAL(large_divisor)
#else
	bf/s	LOCAL(large_divisor)
#endif
	div0u				/* delay slot of bf/s; on sh1 (plain bf)
					   large_divisor repeats it instead */
	swap.w	r4,r0
	shlr16	r4
	bsr	LOCAL(div8)
	shll16	r5			/* delay slot */
	bsr	LOCAL(div7)
	div1	r5,r4			/* delay slot */
	xtrct	r4,r0
	xtrct	r0,r4
	bsr	LOCAL(div8)
	swap.w	r4,r4			/* delay slot */
	bsr	LOCAL(div7)
	div1	r5,r4			/* delay slot */
	lds.l	@r15+,pr
	xtrct	r4,r0
	swap.w	r0,r0
	rotcl	r0
	rts
	shlr16	r5			/* delay slot: undo the shll16 above */

LOCAL(large_divisor):
#ifdef __sh1__
	div0u
#endif
	mov	#0,r0
	xtrct	r4,r0
	xtrct	r0,r4
	bsr	LOCAL(divx4)
	rotcl	r0			/* delay slot */
	bsr	LOCAL(divx4)
	rotcl	r0			/* delay slot */
	bsr	LOCAL(divx4)
	rotcl	r0			/* delay slot */
	bsr	LOCAL(divx4)
	rotcl	r0			/* delay slot */
	lds.l	@r15+,pr
	rts
	rotcl	r0			/* delay slot */
#endif /* ! __SHMEDIA__ */
#endif /* __SH4__ */
#endif /* L_udivsi3 */

#ifdef L_udivdi3
#ifdef __SHMEDIA__
/* udivdi3 - unsigned 64-bit division for SHmedia.
   NOTE(review): operands appear to be r2 (dividend) and r3 (divisor) with
   the quotient returned in r2 - divdi3 below prepares exactly those
   registers before jumping here; confirm against the ABI.
   The routine splits on the size of the divisor: the fall-through path
   ("small divisor") needs a third divide step, the large_divisor path
   only a final adjustment.  */
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(udivdi3)
GLOBAL(udivdi3):
	shlri	r3,1,r4
	nsb	r4,r22
	shlld	r3,r22,r6
	shlri	r6,49,r5
	movi	0xffffffffffffbaf1,r21 /* .l shift count 17.  */
	sub	r21,r5,r1
	mmulfx.w	r1,r1,r4
	mshflo.w	r1,r63,r1
	sub	r63,r22,r20 // r63 == 64 % 64
	mmulfx.w	r5,r4,r4
	pta	LOCAL(large_divisor),tr0
	addi	r20,32,r9
	msub.w	r1,r4,r1
	madd.w	r1,r1,r1
	mmulfx.w	r1,r1,r4
	shlri	r6,32,r7
	bgt/u	r9,r63,tr0 // large_divisor
	mmulfx.w	r5,r4,r4
	shlri	r2,32+14,r19
	addi	r22,-31,r0
	msub.w	r1,r4,r1

	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r19,r5
	sub	r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
			   the case may be, %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4 /* leaving at least one sign bit.  */
	mulu.l	r5,r3,r8
	mshalds.l	r1,r21,r1
	shari	r4,26,r4
	shlld	r8,r0,r8
	add	r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r2,r8,r2
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

	shlri	r2,22,r21
	mulu.l	r21,r1,r21
	shlld	r5,r0,r8
	addi	r20,30-22,r0
	shlrd	r21,r0,r21
	mulu.l	r21,r3,r5
	add	r8,r21,r8
	mcmpgt.l	r21,r63,r21 // See Note 1
	addi	r20,30,r0
	mshfhi.l	r63,r21,r21
	sub	r2,r5,r2
	andc	r2,r21,r2

	/* small divisor: need a third divide step */
	mulu.l	r2,r1,r7
	ptabs	r18,tr0
	addi	r2,1,r2
	shlrd	r7,r0,r7
	mulu.l	r7,r3,r5
	add	r8,r7,r8
	sub	r2,r3,r2
	cmpgt	r2,r5,r5
	add	r8,r5,r2
	/* could test r3 here to check for divide by zero.  */
	blink	tr0,r63

LOCAL(large_divisor):
	mmulfx.w	r5,r4,r4
	shlrd	r2,r9,r25
	shlri	r25,32,r8
	msub.w	r1,r4,r1

	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r8,r5
	sub	r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
			   the case may be, %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4 /* leaving at least one sign bit.  */
	shlri	r5,14-1,r8
	mulu.l	r8,r7,r5
	mshalds.l	r1,r21,r1
	shari	r4,26,r4
	add	r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r25,r5,r25
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

	shlri	r25,22,r21
	mulu.l	r21,r1,r21
	pta	LOCAL(no_lo_adj),tr0
	addi	r22,32,r0
	shlri	r21,40,r21
	mulu.l	r21,r7,r5
	add	r8,r21,r8
	shlld	r2,r0,r2
	sub	r25,r5,r25
	bgtu/u	r7,r25,tr0 // no_lo_adj
	addi	r8,1,r8
	sub	r25,r7,r25
LOCAL(no_lo_adj):
	mextr4	r2,r25,r2

	/* large_divisor: only needs a few adjustments.  */
	mulu.l	r8,r6,r5
	ptabs	r18,tr0
	/* bubble */
	cmpgtu	r5,r2,r5
	sub	r8,r5,r2
	blink	tr0,r63
/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */
#endif /* __SHMEDIA__ */
#endif /* L_udivdi3 */

#ifdef L_divdi3
#ifdef __SHMEDIA__
/* divdi3 - signed 64-bit division built on udivdi3.
   r22 / r23 become 0 or -1 according to the sign of r2 / r3; the
   xor-then-subtract pair replaces each operand by its absolute value.
   Equal signs: jump straight to udivdi3, which returns to our caller.
   Different signs: call udivdi3, negate its result in r2, then return.  */
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(divdi3)
GLOBAL(divdi3):
	pta	GLOBAL(udivdi3),tr0
	shari	r2,63,r22
	shari	r3,63,r23
	xor	r2,r22,r2
	xor	r3,r23,r3
	sub	r2,r22,r2
	sub	r3,r23,r3
	beq/u	r22,r23,tr0		/* same sign: tail-call udivdi3 */
	ptabs	r18,tr1			/* keep our own return address in tr1 */
	blink	tr0,r18			/* call udivdi3, link in r18 */
	sub	r63,r2,r2		/* r2 = -r2 */
	blink	tr1,r63
#endif /* __SHMEDIA__ */
#endif /* L_divdi3 */

#ifdef L_umoddi3
#ifdef __SHMEDIA__
/* umoddi3 - unsigned 64-bit remainder for SHmedia.  Same structure as
   udivdi3 (small-divisor fall-through path, large_divisor path), but the
   final steps keep the rest instead of the quotient.
   NOTE(review): operands appear to be r2 and r3 with the result in r2,
   as set up by moddi3 below - confirm against the ABI.  */
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(umoddi3)
GLOBAL(umoddi3):
	shlri	r3,1,r4
	nsb	r4,r22
	shlld	r3,r22,r6
	shlri	r6,49,r5
	movi	0xffffffffffffbaf1,r21 /* .l shift count 17.  */
	sub	r21,r5,r1
	mmulfx.w	r1,r1,r4
	mshflo.w	r1,r63,r1
	sub	r63,r22,r20 // r63 == 64 % 64
	mmulfx.w	r5,r4,r4
	pta	LOCAL(large_divisor),tr0
	addi	r20,32,r9
	msub.w	r1,r4,r1
	madd.w	r1,r1,r1
	mmulfx.w	r1,r1,r4
	shlri	r6,32,r7
	bgt/u	r9,r63,tr0 // large_divisor
	mmulfx.w	r5,r4,r4
	shlri	r2,32+14,r19
	addi	r22,-31,r0
	msub.w	r1,r4,r1

	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r19,r5
	sub	r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
			   the case may be, %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4 /* leaving at least one sign bit.  */
	mulu.l	r5,r3,r5
	mshalds.l	r1,r21,r1
	shari	r4,26,r4
	shlld	r5,r0,r5
	add	r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r2,r5,r2
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

	shlri	r2,22,r21
	mulu.l	r21,r1,r21
	addi	r20,30-22,r0
	/* bubble */ /* could test r3 here to check for divide by zero.  */
	shlrd	r21,r0,r21
	mulu.l	r21,r3,r5
	mcmpgt.l	r21,r63,r21 // See Note 1
	addi	r20,30,r0
	mshfhi.l	r63,r21,r21
	sub	r2,r5,r2
	andc	r2,r21,r2

	/* small divisor: need a third divide step */
	mulu.l	r2,r1,r7
	ptabs	r18,tr0
	sub	r2,r3,r8 /* re-use r8 here for rest - r3 */
	shlrd	r7,r0,r7
	mulu.l	r7,r3,r5
	/* bubble */
	addi	r8,1,r7
	cmpgt	r7,r5,r7
	cmvne	r7,r8,r2
	sub	r2,r5,r2
	blink	tr0,r63

LOCAL(large_divisor):
	mmulfx.w	r5,r4,r4
	shlrd	r2,r9,r25
	shlri	r25,32,r8
	msub.w	r1,r4,r1

	mulu.l	r1,r7,r4
	addi	r1,-3,r5
	mulu.l	r5,r8,r5
	sub	r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
	shlri	r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
			   the case may be, %0000000000000000 000.11111111111, still */
	muls.l	r1,r4,r4 /* leaving at least one sign bit.  */
	shlri	r5,14-1,r8
	mulu.l	r8,r7,r5
	mshalds.l	r1,r21,r1
	shari	r4,26,r4
	add	r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
	sub	r25,r5,r25
	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

	shlri	r25,22,r21
	mulu.l	r21,r1,r21
	pta	LOCAL(no_lo_adj),tr0
	addi	r22,32,r0
	shlri	r21,40,r21
	mulu.l	r21,r7,r5
	add	r8,r21,r8
	shlld	r2,r0,r2
	sub	r25,r5,r25
	bgtu/u	r7,r25,tr0 // no_lo_adj
	addi	r8,1,r8
	sub	r25,r7,r25
LOCAL(no_lo_adj):
	mextr4	r2,r25,r2

	/* large_divisor: only needs a few adjustments.  */
	mulu.l	r8,r6,r5
	ptabs	r18,tr0
	add	r2,r6,r7
	cmpgtu	r5,r2,r8
	cmvne	r8,r7,r2
	sub	r2,r5,r2
	shlrd	r2,r22,r2
	blink	tr0,r63
/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */
#endif /* __SHMEDIA__ */
#endif /* L_umoddi3 */

#ifdef L_moddi3
#ifdef __SHMEDIA__
/* moddi3 - signed 64-bit remainder built on umoddi3.
   Both operands are replaced by their absolute values exactly as in
   divdi3.  Only the sign of r2 decides the sign of the result: when r22
   is zero (r2 was not negative) umoddi3 is tail-called, otherwise it is
   called and its result in r2 is negated.  Comparing against r63 rather
   than r23 is therefore intentional.  */
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(moddi3)
GLOBAL(moddi3):
	pta	GLOBAL(umoddi3),tr0
	shari	r2,63,r22
	shari	r3,63,r23
	xor	r2,r22,r2
	xor	r3,r23,r3
	sub	r2,r22,r2
	sub	r3,r23,r3
	beq/u	r22,r63,tr0		/* r2 was non-negative: tail-call umoddi3 */
	ptabs	r18,tr1			/* keep our own return address in tr1 */
	blink	tr0,r18			/* call umoddi3, link in r18 */
	sub	r63,r2,r2		/* r2 = -r2 */
	blink	tr1,r63
#endif /* __SHMEDIA__ */
#endif /* L_moddi3 */

#ifdef L_set_fpscr
#if defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
#ifdef __SH5__
	.mode	SHcompact
#endif
/* set_fpscr - load r4 into fpscr and refresh the two-word table
   fpscr_values (8 bytes, declared with .comm below).
   The halves of r4 are swapped into r0 so that the 8-bit immediates of
   or/xor reach what are bits 16..23 of the original value; each table word
   is swapped back before it is stored.  Word A is written with the bits
   selected by #24 forced on (one of them toggled off again unless
   FMOVD_WORKS), word B with those bits cleared.  __SH4__ decides which of
   the two goes to offset 0 and which to offset 4.
   NOTE(review): these are presumably the FPSCR precision / transfer-size
   mode bits - confirm against the SH-4 manual.  */
	.global GLOBAL(set_fpscr)
GLOBAL(set_fpscr):
	lds	r4,fpscr
	mov.l	LOCAL(set_fpscr_L1),r1	/* r1 = &fpscr_values */
	swap.w	r4,r0
	or	#24,r0
#ifndef FMOVD_WORKS
	xor	#16,r0
#endif
#if defined(__SH4__)
	swap.w	r0,r3
	mov.l	r3,@(4,r1)
#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */
	swap.w	r0,r2
	mov.l	r2,@r1
#endif
#ifndef FMOVD_WORKS
	xor	#8,r0
#else
	xor	#24,r0
#endif
#if defined(__SH4__)
	swap.w	r0,r2
	rts
	mov.l	r2,@r1			/* delay slot */
#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */
	swap.w	r0,r3
	rts
	mov.l	r3,@(4,r1)		/* delay slot */
#endif
	.align 2
LOCAL(set_fpscr_L1):
	.long GLOBAL(fpscr_values)
#ifdef __ELF__
	.comm	GLOBAL(fpscr_values),8,4
#else
	.comm	GLOBAL(fpscr_values),8
#endif /* ELF */
#endif /* SH3E / SH4 */
#endif /* L_set_fpscr */

#ifdef L_ic_invalidate
#if __SH5__ == 32
/* init_trampoline - fills a 16-byte area addressed by r0: an 8-byte
   constant (assembled in r20, byte order chosen by __LITTLE_ENDIAN__) at
   offset 0, r2 at offset 8 and r3 at offset 12.  It has no return of its
   own: execution falls through into ic_invalidate, which writes back and
   invalidates the cache line at r0 and returns through r18.
   NOTE(review): the constant presumably encodes the trampoline
   instructions - confirm against the SH5 trampoline definition in sh.h.  */
	.mode	SHmedia
	.section	.text..SHmedia32,"ax"
	.align	2
	.global	GLOBAL(init_trampoline)
GLOBAL(init_trampoline):
	st.l	r0,8,r2
#ifdef __LITTLE_ENDIAN__
	movi	9,r20
	shori	0x402b,r20
	shori	0xd101,r20
	shori	0xd002,r20
#else
	movi	0xffffffffffffd002,r20
	shori	0xd101,r20
	shori	0x402b,r20
	shori	9,r20
#endif
	st.q	r0,0,r20
	st.l	r0,12,r3
	.global	GLOBAL(ic_invalidate)
GLOBAL(ic_invalidate):
	ocbwb	r0,0
	synco
	icbi	r0, 0
	ptabs	r18, tr0
	synci
	blink	tr0, r63
#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__)
/* ic_invalidate, SH4 version.  After writing back the data cache line at
   r4, it branches into a 256 x 32-byte table of "rts + 15 nop" blocks, at
   the block whose cache-line offset matches r4 (mask 0x1fe0), and returns
   from there.  */
	.global GLOBAL(ic_invalidate)
GLOBAL(ic_invalidate):
	ocbwb	@r4
	mova	0f,r0
	mov.w	1f,r1
/* Compute how many cache lines 0f is away from r4.  */
	sub	r0,r4
	and	r1,r4
/* Prepare to branch to 0f plus the cache-line offset.  */
	add	# 0f - 1f,r4
	braf	r4
	nop
1:
	.short	0x1fe0
	.p2align 5
/* This must be aligned to the beginning of a cache line.  */
0:
	.rept	256 /* There are 256 cache lines of 32 bytes.  */
	rts
	.rept	15
	nop
	.endr
	.endr
#endif /* SH4 */
#endif /* L_ic_invalidate */

#if defined (__SH5__) && __SH5__ == 32
#ifdef L_shcompact_call_trampoline
/* Dispatch table for GCC_shcompact_call_trampoline: 16-bit offsets of the
   handlers relative to ct_main_label.  The trampoline indexes it with
   2 * nsb(cookie) from a base biased by -31 * 2.  */
	.section	.rodata
	.align	1
LOCAL(ct_main_table):
.word	LOCAL(ct_r2_fp) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r2_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r2_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r3_fp) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r3_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r3_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r4_fp) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r4_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r4_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r5_fp) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r5_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r5_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_fph) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_fpl) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r6_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_fph) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_fpl) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r7_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_fph) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_fpl) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r8_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r9_fph) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label)
.word	LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label)
	.mode	SHmedia
	.section	.text..SHmedia32, "ax"
	.align	2

     /* This function loads 64-bit general-purpose registers from the
	stack, from a memory address contained in them or from an FP
	register, according to a cookie passed in r1.  Its execution
	time is linear on the number of registers that actually have
	to be copied.  See sh.h for details on the actual bit pattern.

	The function to be called is passed in r0.  If a 32-bit return
	value is expected, the actual function will be tail-called,
	otherwise the return address will be stored in r10 (that the
	caller should expect to be clobbered) and the return value
	will be expanded into r2/r3 upon return.  */

	.global	GLOBAL(GCC_shcompact_call_trampoline)
GLOBAL(GCC_shcompact_call_trampoline):
	ptabs/l	r0, tr0	/* Prepare to call the actual function.  */
	movi	((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0
	pt/l	LOCAL(ct_loop), tr1
	addz.l	r1, r63, r1
	shori	((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0
	/* Dispatch loop: every handler clears its own field in the cookie
	   (r1) and comes back here through tr1.  */
LOCAL(ct_loop):
	nsb	r1, r28
	shlli	r28, 1, r29
	ldx.w	r0, r29, r30
LOCAL(ct_main_label):
	ptrel/l	r30, tr2
	blink	tr2, r63
LOCAL(ct_r2_fp):	/* Copy r2 from an FP register.  */
	/* It must be dr0, so just do it.  */
	fmov.dq	dr0, r2
	movi	7, r30
	shlli	r30, 29, r31
	andc	r1, r31, r1		/* clear the 3-bit field at bit 29 */
	blink	tr1, r63
LOCAL(ct_r3_fp):	/* Copy r3 from an FP register.  */
	/* It is either dr0 or dr2.  */
	movi	7, r30
	shlri	r1, 26, r32
	shlli	r30, 26, r31
	andc	r1, r31, r1		/* clear the 3-bit field at bit 26 */
	fmov.dq	dr0, r3
	beqi/l	r32, 4, tr1
	fmov.dq	dr2, r3
	blink	tr1, r63
LOCAL(ct_r4_fp):	/* Copy r4 from an FP register.  */
	/* Bits 23 and 24 of the cookie, scaled by 8, select one of the
	   two-instruction (8-byte) entries at ct_r4_fp_copy.  */
	shlri	r1, 23 - 3, r34
	andi	r34, 3 << 3, r33
	addi	r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32
LOCAL(ct_r4_fp_base):
	ptrel/l	r32, tr2
	movi	7, r30
	shlli	r30, 23, r31
	andc	r1, r31, r1		/* clear the 3-bit field at bit 23 */
	blink	tr2, r63
LOCAL(ct_r4_fp_copy):
	fmov.dq	dr0, r4
	blink	tr1, r63
	fmov.dq	dr2, r4
	blink	tr1, r63
	fmov.dq	dr4, r4
	blink	tr1, r63
LOCAL(ct_r5_fp):	/* Copy r5 from an FP register.  */
	/* Same scheme as ct_r4_fp, using the field at bit 20.  */
	shlri	r1, 20 - 3, r34
	andi	r34, 3 << 3, r33
	addi	r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32
LOCAL(ct_r5_fp_base):
	ptrel/l	r32, tr2
	movi	7, r30
	shlli	r30, 20, r31
	andc	r1, r31, r1		/* clear the 3-bit field at bit 20 */
	blink	tr2, r63
LOCAL(ct_r5_fp_copy):
	fmov.dq	dr0, r5
	blink	tr1, r63
	fmov.dq	dr2, r5
	blink	tr1, r63
💿 文件大小 30755 K
👤 上传用户 xfzzp_0321
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#linux #gcc #编译器
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -