lib1funcs.asm

来自「linux下的gcc编译器」· 汇编代码 · 共 704 行
ASM
704 行
#ifdef L__divtf3
// __divtf3 -- 80-bit IEEE double-extended quotient.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   farg0 = dividend, farg1 = divisor
// Out:  fret0 = quotient
// Uses: f10-f14, p6, p7
//
// frcpa writes f10 and p6.  When p6 is set the predicated refinement
// sequence runs and produces fret0; when p6 is clear the sequence is
// skipped and f10 is returned unchanged via the (p7) move.

	.text
	.align 16
	.global __divtf3
	.proc __divtf3
__divtf3:
	// p7 starts out true; it is cleared below whenever p6 is true,
	// so p7 ends up as the complement of p6.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
	// Exactly one of the next two writes fret0 (p6/p7 are mutex).
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divtf3
#endif

#ifdef L__divdf3
// __divdf3 -- 64-bit IEEE double quotient.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   farg0 = dividend, farg1 = divisor
// Out:  fret0 = quotient
// Uses: f8, f10-f13, p6, p7
//
// Same p6/p7 scheme as the other FP divides: p6 selects the refinement
// path, p7 (its complement) returns the frcpa result in f10 as-is.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// f8 receives farg0 - farg1 * f11 (the remainder term that the
	// final fma folds back into the quotient estimate f11).
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// __divsf3 -- 32-bit IEEE float quotient.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   farg0 = dividend, farg1 = divisor
// Out:  fret0 = quotient
// Uses: f8-f10, p6, p7
//
// Same p6/p7 scheme as the other FP divides.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
	// Final rounding to single precision happens in the fnorm.s.
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// __divdi3 -- 64-bit signed integer quotient.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend, in1 = divisor
// Out:  ret0 = quotient
// Uses: f8-f13, p6
//
// The refinement steps are predicated on p6 from frcpa; when p6 is
// clear they are skipped and f10 is converted exactly as frcpa left it.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Move both operands into FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Signed integer -> FP conversion, so the operands are not
	// interpreted as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Three Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Truncate the quotient to a signed integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Hand the result back in a general register.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// __moddi3 -- 64-bit signed integer modulus.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend (a), in1 = divisor (b)
// Out:  ret0 = a - q*b, where q is the truncated quotient
// Uses: f8-f14, p6; in1 is overwritten with -b
//
// f14 keeps the integer dividend and f9 is reloaded with the integer
// -b part-way through, so that a single xma.l can form q*(-b) + a.

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Move both operands into FP registers (f14 keeps the raw a).
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	// Signed integer -> FP conversion, so the operands are not
	// interpreted as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Three Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// in1 = -b, computed in the integer unit alongside the FP work.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	// f9 now holds the integer -b; the FP copy of b is no longer
	// read after the fnma in the previous group.
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Hand the result back in a general register.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// __udivdi3 -- 64-bit unsigned integer quotient.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend, in1 = divisor
// Out:  ret0 = quotient
// Uses: f8-f13, p6

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Move both operands into FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	// Unsigned integer -> FP conversion, to avoid FP software-assist
	// faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Three Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Truncate the quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Hand the result back in a general register.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// __umoddi3 -- 64-bit unsigned integer modulus.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend (a), in1 = divisor (b)
// Out:  ret0 = a - q*b, where q is the truncated quotient
// Uses: f8-f14, p6; in1 is overwritten with -b

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Move both operands into FP registers (f14 keeps the raw a).
	setf.sig f14 = in0
	setf.sig f9 = in1
	;;
	// Unsigned integer -> FP conversion, to avoid FP software-assist
	// faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	;;
	// Reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Three Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// in1 = -b, computed in the integer unit alongside the FP work.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	// f9 now holds the integer -b for the xma.l below.
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Truncate the quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Hand the result back in a general register.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// __divsi3 -- 32-bit signed integer quotient.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend, in1 = divisor (low 32 bits significant; both
//       are sign-extended in place on entry)
// Out:  ret0 = quotient
// Uses: r2, f8-f11, p6
//
// NOTE(review): r2 = 0x0ffdd is loaded into f11 with setf.exp and used
// as the addend of the second error-term fma.  It looks like the biased
// exponent 0xffff - 34, i.e. f11 = 2^-34 -- confirm against the Intel
// guide.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Truncate to a signed integer and return it.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// __modsi3 -- 32-bit signed integer modulus.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend, in1 = divisor (sign-extended in place on entry)
// Out:  ret0 = dividend - q*divisor, where q is the truncated quotient
// Uses: r2, f8-f13, p6; in1 is overwritten with the negated divisor
//
// NOTE(review): this routine names r32/r33 directly where its siblings
// write in0/in1; with the 2-input frame declared by .regstk these
// should be the same registers -- confirm before normalizing.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// f13 keeps the integer dividend for the final xma.l.
	setf.sig f13 = r32
	setf.sig f9 = r33
	;;
	// in1 = -divisor, for the q*(-b) + a step at the end.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	// f9 is reloaded with the integer -divisor; its FP value was last
	// read by the fnma in the previous group.
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// __udivsi3 -- 32-bit unsigned integer quotient.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend, in1 = divisor (zero-extended in place on entry)
// Out:  ret0 = quotient
// Uses: r2, f8-f11, p6
//
// The operands are zero-extended to 64 bits first, after which the
// signed conversion fcvt.xf is applied to them.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Truncate to an unsigned integer and return it.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// __umodsi3 -- 32-bit unsigned integer modulus.
//
// Algorithm: the minimum-latency alternative from the Intel IA-64
// Optimization Guide.
//
// In:   in0 = dividend, in1 = divisor (zero-extended in place on entry)
// Out:  ret0 = dividend - q*divisor, where q is the truncated quotient
// Uses: r2, f8-f13, p6; in1 is overwritten with the negated divisor

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// f13 keeps the integer dividend for the final xma.l.
	setf.sig f13 = in0
	setf.sig f9 = in1
	;;
	// in1 = -divisor, for the q*(-b) + a step at the end.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	// f9 is reloaded with the integer -divisor.
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: ar.bsp is what gets read, but
// ar.bspstore is what gets written.  ar.bsp can be read at any time
// (independent of the RSE mode) but is read-only, so the value has to
// be restored through ar.bspstore.  That is fine because
// ar.bsp == ar.bspstore once "flushrs" has executed.

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Layout written to save_area by the stores below:
//	+0	stack_pointer (in1)
//	+8	ar.bsp
//	+16	ar.rnat
//	+24	ar.pfs (as returned by alloc)
//
// Uses: r2, r16-r19.  ar.rsc is rewritten twice: first with only bits
// 2..4 kept (mask 0x1c), then with the low two bits set (| 0x3).
// NOTE(review): presumably that is "RSE enforced-lazy while bsp/rnat
// are read, then back to eager" -- confirm against the architecture
// manual.

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  st8 [in0] = in1, 24		// save_area[0] = sp; in0 -> +24
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16		// save_area[24] = pfs; in0 -> +8
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0		// r2 -> save_area + 16
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16		// save_area[8]  = bsp
	  st8 [r2] = r17		// save_area[16] = rnat
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Reads four consecutive 8-byte words from save_area into r12, r16
// (written to ar.bspstore), r17 (written to ar.rnat) and r18 (written
// to ar.pfs), copies static_chain to r15, and returns through rp,
// which has been loaded with target_label.
//
// Uses: r12, r15-r20.  ar.rsc gets the same mask-then-restore
// treatment as in __ia64_save_stack_nonlocal.

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8		// first word of save_area
	  mov.ret.sptk rp = in0, .L0	// return address = target_label
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8		// second word -> ar.bspstore
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in1], 8		// third word -> ar.rnat
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]		// fourth word -> ar.pfs
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  mov r15 = in2			// static chain
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Reads four consecutive 8-byte words from save_area into r12, r16
// (written to ar.bspstore), r17 (written to ar.rnat) and r18 (written
// to ar.pfs), then returns to the caller.
//
// NOTE(review): alloc declares 4 inputs although the prototype has one
// argument, and the .L0 label is not referenced inside this routine.
// Both are kept exactly as found.

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8		// first word of save_area
	  ;;
	}
	{ .mmb
	  ld8 r16=[in0], 8		// second word -> ar.bspstore
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in0], 8		// third word -> ar.rnat
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]		// fourth word -> ar.pfs
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
.L0:	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Nested-function trampoline.  It lives out of line so that there is
// no need to flush the icache, and so that the on-stack trampoline
// stays small.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+  > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+
//
// On entry r1 is used as a pointer to two 8-byte words: the first is
// loaded into r2 and dereferenced as a further two-word record, the
// second goes to r15.  Per the diagram these are the "target
// descriptor" and "static link" slots at TRAMP+16.

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8		// r2 = word at [r1]; r1 += 8
	  ;;
	  ld8 r15 = [r1]		// r15 = following word
	}
	{ .mmi
	  ld8 r3 = [r2], 8		// r3 = first word of record at r2
	  ;;
	  ld8 r1 = [r2]			// r1 = second word of that record
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6		// jump to the address loaded into r3
	  ;;
	}
	.endp __ia64_trampoline
#endif
lib1funcs.asm - 源码说明

本页面展示了「linux下的gcc编译器」中的 lib1funcs.asm 源码文件，采用汇编编程语言编写，共 704 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与GCC编译器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?