📄 aes-ia64.s
字号:
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// What's wrong with compiler generated code? Compiler never uses
// variable 'shr' which is pairable with 'extr'/'dep' instructions.
// Then it uses 'zxt' which is an I-type, but can be replaced with
// 'and' which in turn can be assigned to M-port [there're double as
// much M-ports as there're I-ports on Itanium 2]. By sacrificing few
// registers for small constants (255, 24 and 16) to be used with
// 'shr' and 'and' instructions I can achieve better ILP, Instruction
// Level Parallelism, and performance. This code outperforms GCC 3.3
// generated code by over factor of 2 (two), GCC 3.4 - by 70% and
// HP C - by 40%. Measured best-case scenario, i.e. aligned
// big-endian input, ECB timing on Itanium 2 is (18 + 13*rounds)
// ticks per block, or 9.25 CPU cycles per byte for 128 bit key.

// Version 1.2 mitigates the hazard of cache-timing attacks by
// a) compressing S-boxes from 8KB to 2KB+256B, b) scheduling
// references to S-boxes for L2 cache latency, c) prefetching T[ed]4
// prior last round. As result performance dropped to (26 + 15*rounds)
// ticks per block or 11 cycles per byte processed with 128-bit key.
// This is ~16% deterioration. For reference Itanium 2 L1 cache has
// 64 bytes line size and L2 - 128 bytes...

.ident	"aes-ia64.S, version 1.2"
.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
.explicit
.text

// Symbolic register names used by both routines in this file.
rk0=r8;     rk1=r9;

pfssave=r2;
lcsave=r10;
prsave=r3;
maskff=r11;
twenty4=r14;
sixteen=r15;

// teXY holds the table index/address/value for output word Y taken
// from table X (naming follows the per-instruction comments below).
te00=r16;   te11=r17;   te22=r18;   te33=r19;
te01=r20;   te12=r21;   te23=r22;   te30=r23;
te02=r24;   te13=r25;   te20=r26;   te31=r27;
te03=r28;   te10=r29;   te21=r30;   te32=r31;

// these are rotating...
// (tN sits one register below sN, so a value written to tN is read
// back as sN after the loop branch rotates the register file)
t0=r32;     s0=r33;
t1=r34;     s1=r35;
t2=r36;     s2=r37;
t3=r38;     s3=r39;

te0=r40;    te1=r41;    te2=r42;    te3=r43;

#if defined(_HPUX_SOURCE) && !defined(_LP64)
# define ADDP	addp4
#else
# define ADDP	add
#endif

// Offsets from Te0
#define TE0	0
#define TE2	2
#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
#define TE1	3
#define TE3	1
#else
#define TE1	1
#define TE3	3
#endif

// This implies that AES_KEY comprises 32-bit key schedule elements
// even on LP64 platforms.
#ifndef	KSZ
# define KSZ	4
# define LDKEY	ld4
#endif

.proc	_ia64_AES_encrypt#
// Internal single-block encryption core. It is entered with
// br.call ... b6=_ia64_AES_encrypt and returns through b6.
//
// Input:	rk0-rk1
//		te0
//		te3	as AES_KEY->rounds!!!
//		s0-s3
//		maskff,twenty4,sixteen
// Output:	r16,r20,r24,r28 as s0-s3
// Clobber:	r16-r31,rk0-rk1,r32-r43
.align	32
_ia64_AES_encrypt:
	.prologue
	.altrp	b6
	.body
// Set-up: load round key 0, derive te1..te3 from te0, and program the
// loop counter from the round count passed in te3 (LC = rounds-3 with
// EC = 2). pr.rot=1<<16 starts the loop with p16 set.
{ .mmi;	alloc	r16=ar.pfs,12,0,0,8
	LDKEY	t0=[rk0],2*KSZ
	mov	pr.rot=1<<16	}
{ .mmi;	LDKEY	t1=[rk1],2*KSZ
	add	te1=TE1,te0
	add	te3=-3,te3	};;
{ .mib;	LDKEY	t2=[rk0],2*KSZ
	mov	ar.ec=2		}
{ .mib;	LDKEY	t3=[rk1],2*KSZ
	add	te2=TE2,te0
	brp.loop.imp	.Le_top,.Le_end-16	};;

// Initial AddRoundKey; te3 is reused as a table pointer from here on.
{ .mmi;	xor	s0=s0,t0
	xor	s1=s1,t1
	mov	ar.lc=te3	}
{ .mmi;	xor	s2=s2,t2
	xor	s3=s3,t3
	add	te3=TE3,te0	};;

// Main round loop. Comments read "cycle/word:operation"; table entries
// are 8 bytes apart (shladd ...,3,...) and read with ld4.
.align	32
.Le_top:
{ .mmi;	(p0)	LDKEY	t0=[rk0],2*KSZ		// 0/0:rk[0]
	(p0)	and	te33=s3,maskff		// 0/0:s3&0xff
	(p0)	extr.u	te22=s2,8,8	}	// 0/0:s2>>8&0xff
{ .mmi; (p0)	LDKEY	t1=[rk1],2*KSZ		// 0/1:rk[1]
	(p0)	and	te30=s0,maskff		// 0/1:s0&0xff
	(p0)	shr.u	te00=s0,twenty4	};;	// 0/0:s0>>24
{ .mmi;	(p0)	LDKEY	t2=[rk0],2*KSZ		// 1/2:rk[2]
	(p0)	shladd	te33=te33,3,te3		// 1/0:te3+s3&0xff
	(p0)	extr.u	te23=s3,8,8	}	// 1/1:s3>>8&0xff
{ .mmi;	(p0)	LDKEY	t3=[rk1],2*KSZ		// 1/3:rk[3]
	(p0)	shladd	te30=te30,3,te3		// 1/1:te3+s0
	(p0)	shr.u	te01=s1,twenty4	};;	// 1/1:s1>>24
{ .mmi;	(p0)	ld4	te33=[te33]		// 2/0:te3[s3&0xff]
	(p0)	shladd	te22=te22,3,te2		// 2/0:te2+s2>>8&0xff
	(p0)	extr.u	te20=s0,8,8	}	// 2/2:s0>>8&0xff
{ .mmi;	(p0)	ld4	te30=[te30]		// 2/1:te3[s0]
	(p0)	shladd	te23=te23,3,te2		// 2/1:te2+s3>>8
	(p0)	shr.u	te02=s2,twenty4	};;	// 2/2:s2>>24
{ .mmi;	(p0)	ld4	te22=[te22]		// 3/0:te2[s2>>8]
	(p0)	shladd	te20=te20,3,te2		// 3/2:te2+s0>>8
	(p0)	extr.u	te21=s1,8,8	}	// 3/3:s1>>8&0xff
{ .mmi;	(p0)	ld4	te23=[te23]		// 3/1:te2[s3>>8]
	(p0)	shladd	te00=te00,3,te0		// 3/0:te0+s0>>24
	(p0)	shr.u	te03=s3,twenty4	};;	// 3/3:s3>>24
{ .mmi;	(p0)	ld4	te20=[te20]		// 4/2:te2[s0>>8]
	(p0)	shladd	te21=te21,3,te2		// 4/3:te2+s1>>8
	(p0)	extr.u	te11=s1,16,8	}	// 4/0:s1>>16&0xff
{ .mmi;	(p0)	ld4	te00=[te00]		// 4/0:te0[s0>>24]
	(p0)	shladd	te01=te01,3,te0		// 4/1:te0+s1>>24
	(p0)	shr.u	te13=s3,sixteen	};;	// 4/2:s3>>16
{ .mmi;	(p0)	ld4	te21=[te21]		// 5/3:te2[s1>>8]
	(p0)	shladd	te11=te11,3,te1		// 5/0:te1+s1>>16
	(p0)	extr.u	te12=s2,16,8	}	// 5/1:s2>>16&0xff
{ .mmi;	(p0)	ld4	te01=[te01]		// 5/1:te0[s1>>24]
	(p0)	shladd	te02=te02,3,te0		// 5/2:te0+s2>>24
	(p0)	and	te31=s1,maskff	};;	// 5/2:s1&0xff
{ .mmi;	(p0)	ld4	te11=[te11]		// 6/0:te1[s1>>16]
	(p0)	shladd	te12=te12,3,te1		// 6/1:te1+s2>>16
	(p0)	extr.u	te10=s0,16,8	}	// 6/3:s0>>16&0xff
{ .mmi;	(p0)	ld4	te02=[te02]		// 6/2:te0[s2>>24]
	(p0)	shladd	te03=te03,3,te0		// 6/3:te0+s3>>24
	(p0)	and	te32=s2,maskff	};;	// 6/3:s2&0xff

{ .mmi;	(p0)	ld4	te12=[te12]		// 7/1:te1[s2>>16]
	(p0)	shladd	te31=te31,3,te3		// 7/2:te3+s1&0xff
	(p0)	and	te13=te13,maskff}	// 7/2:s3>>16&0xff
{ .mmi;	(p0)	ld4	te03=[te03]		// 7/3:te0[s3>>24]
	(p0)	shladd	te32=te32,3,te3		// 7/3:te3+s2
	(p0)	xor	t0=t0,te33	};;	// 7/0:
{ .mmi;	(p0)	ld4	te31=[te31]		// 8/2:te3[s1]
	(p0)	shladd	te13=te13,3,te1		// 8/2:te1+s3>>16
	(p0)	xor	t0=t0,te22	}	// 8/0:
{ .mmi;	(p0)	ld4	te32=[te32]		// 8/3:te3[s2]
	(p0)	shladd	te10=te10,3,te1		// 8/3:te1+s0>>16
	(p0)	xor	t1=t1,te30	};;	// 8/1:
{ .mmi;	(p0)	ld4	te13=[te13]		// 9/2:te1[s3>>16]
	(p0)	ld4	te10=[te10]		// 9/3:te1[s0>>16]
	(p0)	xor	t0=t0,te00	};;	// 9/0:		!L2 scheduling
{ .mmi;	(p0)	xor	t1=t1,te23		// 10[9]/1:
	(p0)	xor	t2=t2,te20		// 10[9]/2:
	(p0)	xor	t3=t3,te21	};;	// 10[9]/3:
{ .mmi;	(p0)	xor	t0=t0,te11		// 11[10]/0:done!
	(p0)	xor	t1=t1,te01		// 11[10]/1:
	(p0)	xor	t2=t2,te02	};;	// 11[10]/2:	!L2 scheduling
// While p16 is set the compare below forces p17 to 0, so the
// (p17) pointer adjustments are skipped on those passes. On a pass
// where p16 is clear, p17 keeps its rotated-in value and te0..te3 are
// moved 2048 bytes (plus a 64-byte stagger) up for the last round.
{ .mmi;	(p0)	xor	t3=t3,te03		// 12[10]/3:
	(p16)	cmp.eq	p0,p17=r0,r0 	};;	// 12[10]/clear (p17)
{ .mmi;	(p0)	xor	t1=t1,te12		// 13[11]/1:done!
	(p0)	xor	t2=t2,te31		// 13[11]/2:
	(p0)	xor	t3=t3,te32	}	// 13[11]/3:
{ .mmi;	(p17)	add	te0=2048,te0		// 13[11]/
	(p17)	add	te1=2048+64-TE1,te1};;	// 13[11]/
{ .mib;	(p0)	xor	t2=t2,te13		// 14[12]/2:done!
	(p17)	add	te2=2048+128-TE2,te2}	// 14[12]/
{ .mib;	(p0)	xor	t3=t3,te10		// 14[12]/3:done!
	(p17)	add	te3=2048+192-TE3,te3	// 14[12]/
	br.ctop.sptk	.Le_top		};;
.Le_end:

// Last round: touch the byte-wide table through te0..te3, then do
// byte loads (ld1) indexed off te0 and merge the bytes with dep/shl.
{ .mmi;	ld8	te12=[te0]		// prefetch Te4
	ld8	te31=[te1]	}
{ .mmi;	ld8	te10=[te2]
	ld8	te32=[te3]	}

{ .mmi;	LDKEY	t0=[rk0],2*KSZ		// 0/0:rk[0]
	and	te33=s3,maskff		// 0/0:s3&0xff
	extr.u	te22=s2,8,8	}	// 0/0:s2>>8&0xff
{ .mmi; LDKEY	t1=[rk1],2*KSZ		// 0/1:rk[1]
	and	te30=s0,maskff		// 0/1:s0&0xff
	shr.u	te00=s0,twenty4	};;	// 0/0:s0>>24
{ .mmi;	LDKEY	t2=[rk0],2*KSZ		// 1/2:rk[2]
	add	te33=te33,te0		// 1/0:te0+s3&0xff
	extr.u	te23=s3,8,8	}	// 1/1:s3>>8&0xff
{ .mmi;	LDKEY	t3=[rk1],2*KSZ		// 1/3:rk[3]
	add	te30=te30,te0		// 1/1:te0+s0
	shr.u	te01=s1,twenty4	};;	// 1/1:s1>>24
{ .mmi;	ld1	te33=[te33]		// 2/0:te0[s3&0xff]
	add	te22=te22,te0		// 2/0:te0+s2>>8&0xff
	extr.u	te20=s0,8,8	}	// 2/2:s0>>8&0xff
{ .mmi;	ld1	te30=[te30]		// 2/1:te0[s0]
	add	te23=te23,te0		// 2/1:te0+s3>>8
	shr.u	te02=s2,twenty4	};;	// 2/2:s2>>24
{ .mmi;	ld1	te22=[te22]		// 3/0:te0[s2>>8]
	add	te20=te20,te0		// 3/2:te0+s0>>8
	extr.u	te21=s1,8,8	}	// 3/3:s1>>8&0xff
{ .mmi;	ld1	te23=[te23]		// 3/1:te0[s3>>8]
	add	te00=te00,te0		// 3/0:te0+s0>>24
	shr.u	te03=s3,twenty4	};;	// 3/3:s3>>24
{ .mmi;	ld1	te20=[te20]		// 4/2:te0[s0>>8]
	add	te21=te21,te0		// 4/3:te0+s1>>8
	extr.u	te11=s1,16,8	}	// 4/0:s1>>16&0xff
{ .mmi;	ld1	te00=[te00]		// 4/0:te0[s0>>24]
	add	te01=te01,te0		// 4/1:te0+s1>>24
	shr.u	te13=s3,sixteen	};;	// 4/2:s3>>16
{ .mmi;	ld1	te21=[te21]		// 5/3:te0[s1>>8]
	add	te11=te11,te0		// 5/0:te0+s1>>16
	extr.u	te12=s2,16,8	}	// 5/1:s2>>16&0xff
{ .mmi;	ld1	te01=[te01]		// 5/1:te0[s1>>24]
	add	te02=te02,te0		// 5/2:te0+s2>>24
	and	te31=s1,maskff	};;	// 5/2:s1&0xff
{ .mmi;	ld1	te11=[te11]		// 6/0:te0[s1>>16]
	add	te12=te12,te0		// 6/1:te0+s2>>16
	extr.u	te10=s0,16,8	}	// 6/3:s0>>16&0xff
{ .mmi;	ld1	te02=[te02]		// 6/2:te0[s2>>24]
	add	te03=te03,te0		// 6/3:te0+s3>>24
	and	te32=s2,maskff	};;	// 6/3:s2&0xff

{ .mmi;	ld1	te12=[te12]		// 7/1:te0[s2>>16]
	add	te31=te31,te0		// 7/2:te0+s1&0xff
	dep	te33=te22,te33,8,8}	// 7/0:
{ .mmi;	ld1	te03=[te03]		// 7/3:te0[s3>>24]
	add	te32=te32,te0		// 7/3:te0+s2
	and	te13=te13,maskff};;	// 7/2:s3>>16&0xff
{ .mmi;	ld1	te31=[te31]		// 8/2:te0[s1]
	add	te13=te13,te0		// 8/2:te0+s3>>16
	dep	te30=te23,te30,8,8}	// 8/1:
{ .mmi;	ld1	te32=[te32]		// 8/3:te0[s2]
	add	te10=te10,te0		// 8/3:te0+s0>>16
	shl	te00=te00,twenty4};;	// 8/0:
{ .mii;	ld1	te13=[te13]		// 9/2:te0[s3>>16]
	dep	te33=te11,te33,16,8	// 9/0:
	shl	te01=te01,twenty4};;	// 9/1:
{ .mii;	ld1	te10=[te10]		// 10/3:te0[s0>>16]
	dep	te31=te20,te31,8,8	// 10/2:
	shl	te02=te02,twenty4};;	// 10/2:
{ .mii;	xor	t0=t0,te33		// 11/0:
	dep	te32=te21,te32,8,8	// 11/3:
	shl	te12=te12,sixteen};;	// 11/1:
{ .mii;	xor	r16=t0,te00		// 12/0:done!
	dep	te31=te13,te31,16,8	// 12/2:
	shl	te03=te03,twenty4};;	// 12/3:
{ .mmi;	xor	t1=t1,te01		// 13/1:
	xor	t2=t2,te02		// 13/2:
	dep	te32=te10,te32,16,8};;	// 13/3:
{ .mmi;	xor	t1=t1,te30		// 14/1:
	xor	r24=t2,te31		// 14/2:done!
	xor	t3=t3,te32	};;	// 14/3:
{ .mib;	xor	r20=t1,te12		// 15/1:done!
	xor	r28=t3,te03		// 15/3:done!
	br.ret.sptk	b6	};;
.endp	_ia64_AES_encrypt#

// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
.global	AES_encrypt#
.proc	AES_encrypt#
.align	32
AES_encrypt:
	.prologue
	.save	ar.pfs,pfssave
{ .mmi;	alloc	pfssave=ar.pfs,3,1,12,0
	and	out0=3,in0
	mov	r3=ip			}
{ .mmi;	ADDP	in0=0,in0
	mov	loc0=psr.um
	ADDP	out11=KSZ*60,in2	};;	// &AES_KEY->rounds

{ .mmi;	ld4	out11=[out11]			// AES_KEY->rounds
	add	out8=(AES_Te#-AES_encrypt#),r3	// Te0
	.save	pr,prsave
	mov	prsave=pr		}
{ .mmi;	rum	1<<3				// clear um.ac
	.save	ar.lc,lcsave
	mov	lcsave=ar.lc		};;

	.body
#if defined(_HPUX_SOURCE)	// HPUX is big-endian, cut 15+15 cycles...
{ .mib; cmp.ne	p6,p0=out0,r0
	add	out0=4,in0
(p6)	br.dpnt.many	.Le_i_unaligned	};;

{ .mmi;	ld4	out1=[in0],8		// s0
	and	out9=3,in1
	mov	twenty4=24		}
{ .mmi;	ld4	out3=[out0],8		// s1
	ADDP	rk0=0,in2
	mov	sixteen=16		};;
{ .mmi;	ld4	out5=[in0]		// s2
	cmp.ne	p6,p0=out9,r0
	mov	maskff=0xff		}
{ .mmb;	ld4	out7=[out0]		// s3
	ADDP	rk1=KSZ,in2
	br.call.sptk.many	b6=_ia64_AES_encrypt	};;

{ .mib;	ADDP	in0=4,in1
	ADDP	in1=0,in1
(p6)	br.spnt	.Le_o_unaligned		};;

{ .mii;	mov	psr.um=loc0
	mov	ar.pfs=pfssave
	mov	ar.lc=lcsave		};;
{ .mmi;	st4	[in1]=r16,8		// s0
	st4	[in0]=r20,8		// s1
	mov	pr=prsave,0x1ffff	};;
{ .mmb;	st4	[in1]=r24		// s2
	st4	[in0]=r28		// s3
	br.ret.sptk.many	b0	};;
#endif

// Byte-wise input path: each state word is assembled from four ld1
// loads, with the byte at the lowest address deposited at bits 24..31.
.align	32
.Le_i_unaligned:
{ .mmi;	add	out0=1,in0
	add	out2=2,in0
	add	out4=3,in0	};;
{ .mmi;	ld1	r16=[in0],4
	ld1	r17=[out0],4	}//;;
{ .mmi;	ld1	r18=[out2],4
	ld1	out1=[out4],4	};;	// s0
{ .mmi;	ld1	r20=[in0],4
	ld1	r21=[out0],4	}//;;
{ .mmi;	ld1	r22=[out2],4
	ld1	out3=[out4],4	};;	// s1
{ .mmi;	ld1	r24=[in0],4
	ld1	r25=[out0],4	}//;;
{ .mmi;	ld1	r26=[out2],4
	ld1	out5=[out4],4	};;	// s2
{ .mmi;	ld1	r28=[in0]
	ld1	r29=[out0]	}//;;
{ .mmi;	ld1	r30=[out2]
	ld1	out7=[out4]	};;	// s3

{ .mii;
	dep	out1=r16,out1,24,8	//;;
	dep	out3=r20,out3,24,8	}//;;
{ .mii;	ADDP	rk0=0,in2
	dep	out5=r24,out5,24,8	//;;
	dep	out7=r28,out7,24,8	};;
{ .mii;	ADDP	rk1=KSZ,in2
	dep	out1=r17,out1,16,8	//;;
	dep	out3=r21,out3,16,8	}//;;
{ .mii;	mov	twenty4=24
	dep	out5=r25,out5,16,8	//;;
	dep	out7=r29,out7,16,8	};;
{ .mii;	mov	sixteen=16
	dep	out1=r18,out1,8,8	//;;
	dep	out3=r22,out3,8,8	}//;;
{ .mii;	mov	maskff=0xff
	dep	out5=r26,out5,8,8	//;;
	dep	out7=r30,out7,8,8	};;

{ .mib;	br.call.sptk.many	b6=_ia64_AES_encrypt	};;

// NOTE(review): the visible source stops inside the bundle below; the
// rest of AES_encrypt (output store path, .endp) is not shown here.
.Le_o_unaligned:
{ .mii;	ADDP	out0=0,in1
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -