// ia64.s — IA-64 (Itanium) big-number arithmetic routines (fragment)
// NOTE(review): web-page chrome ("字号:" = font-size control) removed from this header.
// NOTE(review): this chunk was recovered from a scrape that collapsed all
// line breaks; since "//" comments out the rest of a line, the collapsed
// form commented out live instructions. Line structure has been restored
// below; every instruction/directive token is unchanged. The head of
// bn_mul_words (.global/.proc/label/prologue) and the tail of the final
// comment block were lost upstream and are NOT reconstructed here.
//
// --- bn_mul_words (fragment: entry label/prologue cut off above) ---
// Contract (from the sibling routines' comment style): rp[i] = ap[i]*w,
// returning the final carry word in r8. r32..r35 = rp, ap, num, w.
#ifdef XMA_TEMPTATION
{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
#else
{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
#endif
{ .mib;	mov		r8=r0			// return value
	cmp4.le		p6,p0=r34,r0		// num<=0? then bail out
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mii;	sub		r10=r34,r0,1	// loop trip count = num-1
	mov		r3=ar.lc	// preserve ar.lc across the ctop loop
	mov		r9=pr		};;	// preserve predicates

	.body
{ .mib;	setf.sig	f8=r35		// w
	mov		pr.rot=0x800001<<16
			// ------^----- serves as (p50) at first (p27)
	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
	}
#ifndef XMA_TEMPTATION
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mmi;	mov		r40=0		// serves as r35 at first (p27)
	mov		ar.ec=13	};;

// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
// bypass L1 cache and L2 latency is actually best-case scenario for
// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
// would give us ~5% in *overall* performance improvement on "wider"
// IA-64, but would hurt Itanium for about same because of longer
// epilogue. As it's a matter of few percents in either case I've
// chosen to trade the scalability for development time (you can see
// this very instruction sequence in bn_mul_add_words loop which in
// turn is scalable).
.L_bn_mul_words_ctop:
{ .mfi;	(p25)	getf.sig	r36=f52			// low
	(p21)	xmpy.lu		f48=f37,f8
	(p28)	cmp.ltu		p54,p50=r41,r39	}
{ .mfi;	(p16)	ldf8		f32=[r15],8
	(p21)	xmpy.hu		f40=f37,f8
	(p0)	nop.i		0x0		};;
{ .mii;	(p25)	getf.sig	r32=f44		// high
	.pred.rel	"mutex",p50,p54
	(p50)	add		r40=r38,r35	// (p27) carry-less add
	(p54)	add		r40=r38,r35,1	}	// (p27) add with carry-in
{ .mfb;	(p28)	st8		[r14]=r41,8
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:
{ .mii;	nop.m		0x0
.pred.rel	"mutex",p51,p55
(p51)	add		r8=r36,r0	// final carry, no carry-in
(p55)	add		r8=r36,r0,1	}	// final carry + 1
{ .mfb;	nop.m	0x0
	nop.f	0x0
	nop.b	0x0	}
#else	// XMA_TEMPTATION

	setf.sig	f37=r0	// serves as carry at (p18) tick
	mov		ar.lc=r10
	mov		ar.ec=5;;

// Most of you examining this code very likely wonder why in the name
// of Intel the following loop is commented out? Indeed, it looks so
// neat that you find it hard to believe that it's something wrong
// with it, right? The catch is that every iteration depends on the
// result from previous one and the latter isn't available instantly.
// The loop therefore spins at the latency of xma minus 1, or in other
// words at 6*(n+4) ticks:-( Compare to the "production" loop above
// that runs in 2*(n+11) where the low latency problem is worked around
// by moving the dependency to one-tick latent interger ALU. Note that
// "distance" between ldf8 and xma is not latency of ldf8, but the
// *difference* between xma and ldf8 latencies.
.L_bn_mul_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p18)	xma.hu		f38=f34,f8,f39	}
{ .mfb;	(p20)	stf8		[r32]=f37,8
	(p18)	xma.lu		f35=f34,f8,f39
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

	getf.sig	r8=f41		// the return value

#endif	// XMA_TEMPTATION

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff	// restore caller's predicates
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_mul_words#
#endif

#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// rp[i] += ap[i]*w for i in [0,num); returns the final carry word.
// Uses a software-pipelined br.ctop loop with rotating registers/predicates.
.global	bn_mul_add_words#
.proc	bn_mul_add_words#
.align	64
.skip	48	// makes the loop body aligned at 64-byte boundary
bn_mul_add_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
	.save	ar.lc,r3
	.save	pr,r9
{ .mmi;	alloc		r2=ar.pfs,4,4,0,8
	cmp4.le		p6,p0=r34,r0	// num<=0? then bail out
	mov		r3=ar.lc	};;
{ .mib;	mov		r8=r0		// return value
	sub		r10=r34,r0,1	// trip count = num-1
(p6)	br.ret.spnt.many	b0	};;

	.body
{ .mib;	setf.sig	f8=r35		// w
	mov		r9=pr
	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
	}
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mii;	ADDP		r16=0,r32	// rp copy
	mov		pr.rot=0x2001<<16
			// ------^----- serves as (p40) at first (p27)
	mov		ar.ec=11	};;

// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike previous versions it scales:-) Previous
// version was peforming *all* additions in IALU and was starving
// for those even on Itanium 2. In this version one addition is
// moved to FPU and is folded with multiplication. This is at cost
// of propogating the result from previous call to this subroutine
// to L2 cache... In other words negligible even for shorter keys.
// *Overall* performance improvement [over previous version] varies
// from 11 to 22 percent depending on key length.
.L_bn_mul_add_words_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p23)	getf.sig	r36=f45		// low
	(p20)	xma.lu		f42=f36,f8,f50	// low
	(p40)	add		r39=r39,r35	}	// (p27)
{ .mfi;	(p16)	ldf8		f32=[r15],8	// *(ap++)
	(p20)	xma.hu		f36=f36,f8,f50	// high
	(p42)	add		r39=r39,r35,1	};;	// (p27)
{ .mmi;	(p24)	getf.sig	r32=f40		// high
	(p16)	ldf8		f46=[r16],8	// *(rp1++)
	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
{ .mib;	(p26)	st8		[r14]=r39,8	// *(rp2++)
	(p42)	cmp.leu		p41,p39=r39,r35	// (p27)
	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
.L_bn_mul_add_words_cend:
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		r8=r35,r0	// final carry
(p42)	add		r8=r35,r0,1	// final carry + 1
	mov		pr=r9,0x1ffff	}
{ .mib;	rum		1<<5		// clear um.mfh
	mov		ar.lc=r3
	br.ret.sptk.many	b0	};;
.endp	bn_mul_add_words#
#endif

#if 1
//
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
// rp[2i],rp[2i+1] = lo/hi halves of ap[i]^2; no carries between words.
.global	bn_sqr_words#
.proc	bn_sqr_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sqr_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,0,0,0
	sxt4		r34=r34		};;	// sign-extend 32-bit num
{ .mii;	cmp.le		p6,p0=r34,r0
	mov		r8=r0		}	// return value
{ .mfb;	ADDP		r32=0,r32
	nop.f		0x0
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mii;	sub		r10=r34,r0,1	// trip count = num-1
	mov		r3=ar.lc
	mov		r9=pr		};;

	.body
{ .mib;	ADDP		r33=0,r33
	mov		pr.rot=1<<16
	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
	}
{ .mii;	add		r34=8,r32	// second store pointer (odd words)
	mov		ar.lc=r10
	mov		ar.ec=18	};;

// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
// possible to compress the epilogue (I'm getting tired to write this
// comment over and over) and get down to 2*n+16 at the cost of
// scalability. The decision will very likely be reconsidered after the
// benchmark program is profiled. I.e. if perfomance gain on Itanium
// will appear larger than loss on "wider" IA-64, then the loop should
// be explicitely split and the epilogue compressed.
.L_bn_sqr_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p25)	xmpy.lu		f42=f41,f41
	(p0)	nop.i		0x0	}
{ .mib;	(p33)	stf8		[r32]=f50,16
	(p0)	nop.i		0x0
	(p0)	nop.b		0x0	}
{ .mfi;	(p0)	nop.m		0x0
	(p25)	xmpy.hu		f52=f41,f41
	(p0)	nop.i		0x0	}
{ .mib;	(p33)	stf8		[r34]=f60,16
	(p0)	nop.i		0x0
	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
.L_bn_sqr_words_cend:
{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sqr_words#
#endif

#if 1
// Apparently we win nothing by implementing special bn_sqr_comba8.
// Yes, it is possible to reduce the number of multiplications by
// almost factor of two, but then the amount of additions would
// increase by factor of two (as we would have to perform those
// otherwise performed by xma ourselves). Normally we would trade
// anyway as multiplications are way more expensive, but not this
// time... Multiplication kernel is fully pipelined and as we drain
// one 128-bit multiplication result per clock cycle multiplications
// are effectively as inexpensive as additions. Special implementation
// might become of interest for "wider" IA-64 implementation as you'll
// be able to get through the multiplication phase faster (there won't
// be any stall issues as discussed in the commentary section below and
// you therefore will be able to employ all 4 FP units)... But these
// Itanium days it's simply too hard to justify the effort so I just
// drop down to bn_mul_comba8 code:-)
//
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
// Duplicates the 'a' pointer into both operand slots and tail-branches
// into bn_mul_comba8's shared body (.L_cheat_entry_point8, defined in a
// part of the file not shown here).
.global	bn_sqr_comba8#
.proc	bn_sqr_comba8#
.align	64
bn_sqr_comba8:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
	addp4	r33=0,r33	// 32-bit HP-UX: zero-extend pointers
	addp4	r32=0,r32		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	mov	r34=r33		// second operand = first operand (squaring)
	add	r14=8,r33		};;
	.body
{ .mii;	add	r17=8,r34
	add	r15=16,r33
	add	r18=16,r34		}
{ .mfb;	add	r16=24,r33
	br	.L_cheat_entry_point8	};;
.endp	bn_sqr_comba8#
#endif

#if 1
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
// clause in Itanium
// NOTE(review): source truncated here by the upstream scrape; the rest of
// this comment and the following routine (bn_mul_comba8) are missing.
// NOTE(review): trailing web-page chrome removed (hosting site's
// keyboard-shortcut help: copy, search, full-screen, theme, font size).
// It was not part of the original assembly source.