// ia64.s — IA-64 (Itanium) big-number arithmetic routines (fragment)
// NOTE(review): web-page chrome ("字号:" = font-size control) removed from this header.
// NOTE(review): this chunk was recovered from a scrape that collapsed all
// line breaks; since "//" comments out the rest of a line, the collapsed
// form commented out live instructions. Line structure has been restored
// below; every instruction/directive token is unchanged. The head of
// bn_mul_words (.global/.proc/label/prologue) and the tail of the final
// comment block were lost upstream and are NOT reconstructed here.
//
// --- bn_mul_words (fragment: entry label/prologue cut off above) ---
// Contract (from the sibling routines' comment style): rp[i] = ap[i]*w,
// returning the final carry word in r8. r32..r35 = rp, ap, num, w.
#ifdef XMA_TEMPTATION
{ .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
#else
{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
#endif
{ .mib;	mov		r8=r0			// return value
	cmp4.le		p6,p0=r34,r0		// num<=0? then bail out
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mii;	sub		r10=r34,r0,1	// loop trip count = num-1
	mov		r3=ar.lc	// preserve ar.lc across the ctop loop
	mov		r9=pr		};;	// preserve predicates

	.body
{ .mib;	setf.sig	f8=r35		// w
	mov		pr.rot=0x800001<<16
			// ------^----- serves as (p50) at first (p27)
	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
	}
#ifndef XMA_TEMPTATION
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mmi;	mov		r40=0		// serves as r35 at first (p27)
	mov		ar.ec=13	};;

// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
// bypass L1 cache and L2 latency is actually best-case scenario for
// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
// would give us ~5% in *overall* performance improvement on "wider"
// IA-64, but would hurt Itanium for about same because of longer
// epilogue. As it's a matter of few percents in either case I've
// chosen to trade the scalability for development time (you can see
// this very instruction sequence in bn_mul_add_words loop which in
// turn is scalable).
.L_bn_mul_words_ctop:
{ .mfi;	(p25)	getf.sig	r36=f52			// low
	(p21)	xmpy.lu		f48=f37,f8
	(p28)	cmp.ltu		p54,p50=r41,r39	}
{ .mfi;	(p16)	ldf8		f32=[r15],8
	(p21)	xmpy.hu		f40=f37,f8
	(p0)	nop.i		0x0		};;
{ .mii;	(p25)	getf.sig	r32=f44		// high
	.pred.rel	"mutex",p50,p54
	(p50)	add		r40=r38,r35	// (p27) carry-less add
	(p54)	add		r40=r38,r35,1	}	// (p27) add with carry-in
{ .mfb;	(p28)	st8		[r14]=r41,8
	(p0)	nop.f		0x0
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:
{ .mii;	nop.m		0x0
.pred.rel	"mutex",p51,p55
(p51)	add		r8=r36,r0	// final carry, no carry-in
(p55)	add		r8=r36,r0,1	}	// final carry + 1
{ .mfb;	nop.m	0x0
	nop.f	0x0
	nop.b	0x0	}
#else	// XMA_TEMPTATION

	setf.sig	f37=r0	// serves as carry at (p18) tick
	mov		ar.lc=r10
	mov		ar.ec=5;;

// Most of you examining this code very likely wonder why in the name
// of Intel the following loop is commented out? Indeed, it looks so
// neat that you find it hard to believe that it's something wrong
// with it, right? The catch is that every iteration depends on the
// result from previous one and the latter isn't available instantly.
// The loop therefore spins at the latency of xma minus 1, or in other
// words at 6*(n+4) ticks:-( Compare to the "production" loop above
// that runs in 2*(n+11) where the low latency problem is worked around
// by moving the dependency to one-tick latent interger ALU. Note that
// "distance" between ldf8 and xma is not latency of ldf8, but the
// *difference* between xma and ldf8 latencies.
.L_bn_mul_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p18)	xma.hu		f38=f34,f8,f39	}
{ .mfb;	(p20)	stf8		[r32]=f37,8
	(p18)	xma.lu		f35=f34,f8,f39
	br.ctop.sptk	.L_bn_mul_words_ctop	};;
.L_bn_mul_words_cend:

	getf.sig	r8=f41		// the return value

#endif	// XMA_TEMPTATION

{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff	// restore caller's predicates
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_mul_words#
#endif

#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// rp[i] += ap[i]*w for i in [0,num); returns the final carry word.
// Uses a software-pipelined br.ctop loop with rotating registers/predicates.
.global	bn_mul_add_words#
.proc	bn_mul_add_words#
.align	64
.skip	48	// makes the loop body aligned at 64-byte boundary
bn_mul_add_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
	.save	ar.lc,r3
	.save	pr,r9
{ .mmi;	alloc		r2=ar.pfs,4,4,0,8
	cmp4.le		p6,p0=r34,r0	// num<=0? then bail out
	mov		r3=ar.lc	};;
{ .mib;	mov		r8=r0		// return value
	sub		r10=r34,r0,1	// trip count = num-1
(p6)	br.ret.spnt.many	b0	};;

	.body
{ .mib;	setf.sig	f8=r35		// w
	mov		r9=pr
	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
	}
{ .mmi;	ADDP		r14=0,r32	// rp
	ADDP		r15=0,r33	// ap
	mov		ar.lc=r10	}
{ .mii;	ADDP		r16=0,r32	// rp copy
	mov		pr.rot=0x2001<<16
			// ------^----- serves as (p40) at first (p27)
	mov		ar.ec=11	};;

// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
// Itanium 2. Yes, unlike previous versions it scales:-) Previous
// version was peforming *all* additions in IALU and was starving
// for those even on Itanium 2. In this version one addition is
// moved to FPU and is folded with multiplication. This is at cost
// of propogating the result from previous call to this subroutine
// to L2 cache... In other words negligible even for shorter keys.
// *Overall* performance improvement [over previous version] varies
// from 11 to 22 percent depending on key length.
.L_bn_mul_add_words_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p23)	getf.sig	r36=f45		// low
	(p20)	xma.lu		f42=f36,f8,f50	// low
	(p40)	add		r39=r39,r35	}	// (p27)
{ .mfi;	(p16)	ldf8		f32=[r15],8	// *(ap++)
	(p20)	xma.hu		f36=f36,f8,f50	// high
	(p42)	add		r39=r39,r35,1	};;	// (p27)
{ .mmi;	(p24)	getf.sig	r32=f40		// high
	(p16)	ldf8		f46=[r16],8	// *(rp1++)
	(p40)	cmp.ltu		p41,p39=r39,r35	}	// (p27)
{ .mib;	(p26)	st8		[r14]=r39,8	// *(rp2++)
	(p42)	cmp.leu		p41,p39=r39,r35	// (p27)
	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
.L_bn_mul_add_words_cend:
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		r8=r35,r0	// final carry
(p42)	add		r8=r35,r0,1	// final carry + 1
	mov		pr=r9,0x1ffff	}
{ .mib;	rum		1<<5		// clear um.mfh
	mov		ar.lc=r3
	br.ret.sptk.many	b0	};;
.endp	bn_mul_add_words#
#endif

#if 1
//
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
// rp[2i],rp[2i+1] = lo/hi halves of ap[i]^2; no carries between words.
.global	bn_sqr_words#
.proc	bn_sqr_words#
.align	64
.skip	32	// makes the loop body aligned at 64-byte boundary
bn_sqr_words:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
{ .mii;	alloc		r2=ar.pfs,3,0,0,0
	sxt4		r34=r34		};;	// sign-extend 32-bit num
{ .mii;	cmp.le		p6,p0=r34,r0
	mov		r8=r0		}	// return value
{ .mfb;	ADDP		r32=0,r32
	nop.f		0x0
(p6)	br.ret.spnt.many	b0	};;

	.save	ar.lc,r3
{ .mii;	sub		r10=r34,r0,1	// trip count = num-1
	mov		r3=ar.lc
	mov		r9=pr		};;

	.body
{ .mib;	ADDP		r33=0,r33
	mov		pr.rot=1<<16
	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
	}
{ .mii;	add		r34=8,r32	// second store pointer (odd words)
	mov		ar.lc=r10
	mov		ar.ec=18	};;

// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
// possible to compress the epilogue (I'm getting tired to write this
// comment over and over) and get down to 2*n+16 at the cost of
// scalability. The decision will very likely be reconsidered after the
// benchmark program is profiled. I.e. if perfomance gain on Itanium
// will appear larger than loss on "wider" IA-64, then the loop should
// be explicitely split and the epilogue compressed.
.L_bn_sqr_words_ctop:
{ .mfi;	(p16)	ldf8		f32=[r33],8
	(p25)	xmpy.lu		f42=f41,f41
	(p0)	nop.i		0x0	}
{ .mib;	(p33)	stf8		[r32]=f50,16
	(p0)	nop.i		0x0
	(p0)	nop.b		0x0	}
{ .mfi;	(p0)	nop.m		0x0
	(p25)	xmpy.hu		f52=f41,f41
	(p0)	nop.i		0x0	}
{ .mib;	(p33)	stf8		[r34]=f60,16
	(p0)	nop.i		0x0
	br.ctop.sptk	.L_bn_sqr_words_ctop	};;
.L_bn_sqr_words_cend:
{ .mii;	nop.m		0x0
	mov		pr=r9,0x1ffff
	mov		ar.lc=r3	}
{ .mfb;	rum		1<<5		// clear um.mfh
	nop.f		0x0
	br.ret.sptk.many	b0	};;
.endp	bn_sqr_words#
#endif

#if 1
// Apparently we win nothing by implementing special bn_sqr_comba8.
// Yes, it is possible to reduce the number of multiplications by
// almost factor of two, but then the amount of additions would
// increase by factor of two (as we would have to perform those
// otherwise performed by xma ourselves). Normally we would trade
// anyway as multiplications are way more expensive, but not this
// time... Multiplication kernel is fully pipelined and as we drain
// one 128-bit multiplication result per clock cycle multiplications
// are effectively as inexpensive as additions. Special implementation
// might become of interest for "wider" IA-64 implementation as you'll
// be able to get through the multiplication phase faster (there won't
// be any stall issues as discussed in the commentary section below and
// you therefore will be able to employ all 4 FP units)... But these
// Itanium days it's simply too hard to justify the effort so I just
// drop down to bn_mul_comba8 code:-)
//
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
// Duplicates the 'a' pointer into both operand slots and tail-branches
// into bn_mul_comba8's shared body (.L_cheat_entry_point8, defined in a
// part of the file not shown here).
.global	bn_sqr_comba8#
.proc	bn_sqr_comba8#
.align	64
bn_sqr_comba8:
	.prologue
	.fframe	0
	.save	ar.pfs,r2
#if defined(_HPUX_SOURCE) && !defined(_LP64)
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
	addp4	r33=0,r33	// 32-bit HP-UX: zero-extend pointers
	addp4	r32=0,r32		};;
{ .mii;
#else
{ .mii;	alloc	r2=ar.pfs,2,1,0,0
#endif
	mov	r34=r33		// second operand = first operand (squaring)
	add	r14=8,r33		};;
	.body
{ .mii;	add	r17=8,r34
	add	r15=16,r33
	add	r18=16,r34		}
{ .mfb;	add	r16=24,r33
	br	.L_cheat_entry_point8	};;
.endp	bn_sqr_comba8#
#endif

#if 1
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed for instructions fetch? Or did I misinterpret some
// clause in Itanium
// NOTE(review): source truncated here by the upstream scrape; the rest of
// this comment and the following routine (bn_mul_comba8) are missing.
// NOTE(review): trailing web-page chrome removed (hosting site's
// keyboard-shortcut help: copy, search, full-screen, theme, font size).
// It was not part of the original assembly source.