#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512_Transform for Itanium.
#
# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
# faster than gcc and >60%(!) faster than code generated by HP-UX
# compiler (yes, HP-UX is generating slower code, because unlike gcc,
# it failed to deploy "shift right pair," 'shrp' instruction, which
# substitutes for 64-bit rotate).
#
# 924 cycles long sha256_block outperforms gcc by over factor of 2(!)
# and HP-UX compiler - by >40% (yes, gcc won sha512_block, but lost
# this one big time). Note that "formally" 924 is about 100 cycles
# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
# 64-bit ones and 1003*64/80 gives 802. Extra cycles, 2 per round,
# are spent on extra work to provide for 32-bit rotations. 32-bit
# rotations are still handled by 'shrp' instruction and for this
# reason lower 32 bits are deposited to upper half of 64-bit register
# prior 'shrp' issue. And in order to minimize the amount of such
# operations, X[16] values are *maintained* with copies of lower
# halves in upper halves, which is why you'll spot such instructions
# as custom 'mux2', "parallel 32-bit add," 'padd4' and "parallel
# 32-bit unsigned right shift," 'pshr4.u' instructions here.
#
# Rules of engagement.
#
# There is only one integer shifter meaning that if I have two rotate,
# deposit or extract instructions in adjacent bundles, they shall
# split [at run-time if they have to]. But note that variable and
# parallel shifts are performed by multi-media ALU and *are* pairable
# with rotates [and alike].
# On the backside MMALU is rather slow: it
# takes 2 extra cycles before the result of integer operation is
# available *to* MMALU and 2(*) extra cycles before the result of MM
# operation is available "back" *to* integer ALU, not to mention that
# MMALU itself has 2 cycles latency. However! I explicitly scheduled
# these MM instructions to avoid MM stalls, so that all these extra
# latencies get "hidden" in instruction-level parallelism.
#
# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
#     for 2 in order to provide for best *overall* performance,
#     because on Itanium 1 stall on MM result is accompanied by
#     pipeline flush, which takes 6 cycles:-(
#
# Resulting performance numbers for 900MHz Itanium 2 system:
#
# The 'numbers' are in 1000s of bytes per second processed.
# type	16 bytes	64 bytes	256 bytes	1024 bytes	8192 bytes
# sha1(*)	6210.14k	20376.30k	52447.83k	85870.05k	105478.12k
# sha256	7476.45k	20572.05k	41538.34k	56062.29k	62093.18k
# sha512	4996.56k	20026.28k	47597.20k	85278.79k	111501.31k
#
# (*) SHA1 numbers are for HP-UX compiler and are presented purely
#     for reference purposes.
# I bet it can be improved too...
#
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.

# Select SHA-512 vs. SHA-256 parameters from the requested output file
# name: word size, load/store mnemonics, add/shift flavour (plain
# integer ops for SHA-512, "parallel" MM ops for SHA-256), K table
# name, entry-point name, Sigma/sigma rotation amounts, round count.
$output=shift;
if ($output =~ /512.*\.(?:s|asm)/) {	# NOTE(review): was [s|asm] -- a char class, not alternation
	$SZ=8;
	$BITS=8*$SZ;
	$LDW="ld8";
	$STW="st8";
	$ADD="add";
	$SHRU="shr.u";
	$TABLE="K512";
	$func="sha512_block_data_order";
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1, 8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} elsif ($output =~ /256.*\.(?:s|asm)/) {
	$SZ=4;
	$BITS=8*$SZ;
	$LDW="ld4";
	$STW="st4";
	$ADD="padd4";
	$SHRU="pshr4.u";
	$TABLE="K256";
	$func="sha256_block_data_order";
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
} else {
	die "nonsense $output";
}

# All generated code goes to the named file via STDOUT.  The original
# "open ... || die" could never die: || bound to the always-true
# filename string.  Use low-precedence "or" (and 3-arg open) instead.
open STDOUT, '>', $output or die "can't open $output: $!";

# On HP-UX in ILP32 mode pointers must be extended with addp4; in LP64
# mode (+DD64 for HP cc, -mlp64 for gcc) a plain add suffices.
if ($^O eq "hpux") {
	$ADDP="addp4";
	for (@ARGV) { $ADDP="add" if (/(?:\+DD|\-mlp)64/); }	# NOTE(review): was [\+DD|\-mlp]64
} else {
	$ADDP="add";
}

# Endianness: honour explicit -DB_ENDIAN/-DL_ENDIAN flags, otherwise
# detect the build host's byte order.
for (@ARGV) {
	$big_endian=1 if (/\-DB_ENDIAN/);
	$big_endian=0 if (/\-DL_ENDIAN/);
}
if (!defined($big_endian)) {
	$big_endian=(unpack('L',pack('N',1))==1);
}

# Common prologue: symbolic register aliases, entry point, and loading
# of the eight chaining values A_-H_ from the SHA_CTX (1st argument).
$code=<<___;
.ident	\"$output, version 1.1\"
.ident	\"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
.text

pfssave=r2;
lcsave=r3;
prsave=r14;
K=r15;
A=r16;	B=r17;	C=r18;	D=r19;
E=r20;	F=r21;	G=r22;	H=r23;
T1=r24;	T2=r25;
s0=r26;	s1=r27;	t0=r28;	t1=r29;
Ktbl=r30;
ctx=r31;	// 1st arg
input=r48;	// 2nd arg
num=r49;	// 3rd arg
sgm0=r50;	sgm1=r51;	// small constants
A_=r54;	B_=r55;	C_=r56;	D_=r57;
E_=r58;	F_=r59;	G_=r60;	H_=r61;

// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
.global	$func#
.proc	$func#
.align	32
$func:
	.prologue
	.save	ar.pfs,pfssave
{ .mmi;	alloc	pfssave=ar.pfs,3,27,0,16
	$ADDP	ctx=0,r32		// 1st arg
	.save	ar.lc,lcsave
	mov	lcsave=ar.lc	}
{ .mmi;	$ADDP	input=0,r33	// 2nd arg
	mov	num=r34		// 3rd arg
	.save	pr,prsave
	mov	prsave=pr	};;

	.body
{ .mib;	add	r8=0*$SZ,ctx
	add	r9=1*$SZ,ctx
	brp.loop.imp	.L_first16,.L_first16_end-16	}
{ .mib;	add	r10=2*$SZ,ctx
	add	r11=3*$SZ,ctx
	brp.loop.imp	.L_rest,.L_rest_end-16	};;

// load A-H
.Lpic_point:
{ .mmi;	$LDW	A_=[r8],4*$SZ
	$LDW	B_=[r9],4*$SZ
mov	Ktbl=ip	}
{ .mmi;	$LDW	C_=[r10],4*$SZ
	$LDW	D_=[r11],4*$SZ
	mov	sgm0=$sigma0[2]	};;
{ .mmi;	$LDW	E_=[r8]
	$LDW	F_=[r9]
	add	Ktbl=($TABLE#-.Lpic_point),Ktbl	}
{ .mmi;	$LDW	G_=[r10]
	$LDW	H_=[r11]
	cmp.ne	p0,p16=0,r0	};;	// used in sha256_block
___
# 64-bit path only: remember the input's misalignment (1..7 bytes) in
# predicates p9..p15 and round the pointer down to an 8-byte boundary.
$code.=<<___ if ($BITS==64);
{ .mii;	and	r8=7,input
	and	input=~7,input;;
	cmp.eq	p9,p0=1,r8	}
{ .mmi;	cmp.eq	p10,p0=2,r8
	cmp.eq	p11,p0=3,r8
	cmp.eq	p12,p0=4,r8	}
{ .mmi;	cmp.eq	p13,p0=5,r8
	cmp.eq	p14,p0=6,r8
	cmp.eq	p15,p0=7,r8	};;
___
# Per-block outer loop entry: (re)load working variables A-H from the
# saved chaining values and set up the loop counter/epilogue counter.
$code.=<<___;
.L_outer:
.rotr	X[16]
{ .mmi;	mov	A=A_
	mov	B=B_
	mov	ar.lc=14	}
{ .mmi;	mov	C=C_
	mov	D=D_
	mov	E=E_	}
{ .mmi;	mov	F=F_
	mov	G=G_
	mov	ar.ec=2	}
{ .mmi;	ld1	X[15]=[input],$SZ	// eliminated in 64-bit
	mov	H=H_
	mov	sgm1=$sigma1[2]	};;
___
# 32-bit (SHA-256) path: gather the message word byte by byte and keep
# a copy of the lower 32 bits in the upper register half (mux2).
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
.align	32
.L_first16:
{ .mmi;	add	r9=1-$SZ,input
	add	r10=2-$SZ,input
	add	r11=3-$SZ,input	};;
{ .mmi;	ld1	r9=[r9]
	ld1	r10=[r10]
	dep.z	$t1=E,32,32	}
{ .mmi;	$LDW	K=[Ktbl],$SZ
	ld1	r11=[r11]
	zxt4	E=E	};;
{ .mii;	or	$t1=$t1,E
	dep	X[15]=X[15],r9,8,8
	dep	r11=r10,r11,8,8	};;
{ .mmi;	and	T1=F,E
	and	T2=A,B
	dep	X[15]=X[15],r11,16,16	}
{ .mmi;	andcm	r8=G,E
	and	r9=A,C
	mux2	$t0=A,0x44	};;	// copy lower half to upper
{ .mmi;	(p16)	ld1	X[15-1]=[input],$SZ	// prefetch
	xor	T1=T1,r8	// T1=((e & f) ^ (~e & g))
	_rotr	r11=$t1,$Sigma1[0] }	// ROTR(e,14)
{ .mib;	and	r10=B,C
	xor	T2=T2,r9	};;
___
# 64-bit (SHA-512) path: load all of X[0..15] with 8-byte loads, then
# branch on the recorded misalignment to a fix-up that shifts adjacent
# words together (shrp) to reconstruct the unaligned stream.
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
// in 64-bit mode I load whole X[16] at once and take care of alignment...
{ .mmi;	add	r8=1*$SZ,input
	add	r9=2*$SZ,input
	add	r10=3*$SZ,input	};;
{ .mmb;	$LDW	X[15]=[input],4*$SZ
	$LDW	X[14]=[r8],4*$SZ
(p9)	br.cond.dpnt.many	.L1byte	};;
{ .mmb;	$LDW	X[13]=[r9],4*$SZ
	$LDW	X[12]=[r10],4*$SZ
(p10)	br.cond.dpnt.many	.L2byte	};;
{ .mmb;	$LDW	X[11]=[input],4*$SZ
	$LDW	X[10]=[r8],4*$SZ
(p11)	br.cond.dpnt.many	.L3byte	};;
{ .mmb;	$LDW	X[ 9]=[r9],4*$SZ
	$LDW	X[ 8]=[r10],4*$SZ
(p12)	br.cond.dpnt.many	.L4byte	};;
{ .mmb;	$LDW	X[ 7]=[input],4*$SZ
	$LDW	X[ 6]=[r8],4*$SZ
(p13)	br.cond.dpnt.many	.L5byte	};;
{ .mmb;	$LDW	X[ 5]=[r9],4*$SZ
	$LDW	X[ 4]=[r10],4*$SZ
(p14)	br.cond.dpnt.many	.L6byte	};;
{ .mmb;	$LDW	X[ 3]=[input],4*$SZ
	$LDW	X[ 2]=[r8],4*$SZ
(p15)	br.cond.dpnt.many	.L7byte	};;
{ .mmb;	$LDW	X[ 1]=[r9],4*$SZ
	$LDW	X[ 0]=[r10],4*$SZ
	br.many	.L_first16	};;
.L1byte:
{ .mmi;	$LDW	X[13]=[r9],4*$SZ
	$LDW	X[12]=[r10],4*$SZ
	shrp	X[15]=X[15],X[14],56	};;
{ .mmi;	$LDW	X[11]=[input],4*$SZ
	$LDW	X[10]=[r8],4*$SZ
	shrp	X[14]=X[14],X[13],56	}
{ .mmi;	$LDW	X[ 9]=[r9],4*$SZ
	$LDW	X[ 8]=[r10],4*$SZ
	shrp	X[13]=X[13],X[12],56	};;
{ .mmi;	$LDW	X[ 7]=[input],4*$SZ
	$LDW	X[ 6]=[r8],4*$SZ
	shrp	X[12]=X[12],X[11],56	}
{ .mmi;	$LDW	X[ 5]=[r9],4*$SZ
	$LDW	X[ 4]=[r10],4*$SZ
	shrp	X[11]=X[11],X[10],56	};;
{ .mmi;	$LDW	X[ 3]=[input],4*$SZ
	$LDW	X[ 2]=[r8],4*$SZ
	shrp	X[10]=X[10],X[ 9],56	}
{ .mmi;	$LDW	X[ 1]=[r9],4*$SZ
	$LDW	X[ 0]=[r10],4*$SZ
	shrp	X[ 9]=X[ 9],X[ 8],56	};;
{ .mii;	$LDW	T1=[input]
	shrp	X[ 8]=X[ 8],X[ 7],56
	shrp	X[ 7]=X[ 7],X[ 6],56	}
{ .mii;	shrp	X[ 6]=X[ 6],X[ 5],56
	shrp	X[ 5]=X[ 5],X[ 4],56	};;
{ .mii;	shrp	X[ 4]=X[ 4],X[ 3],56
	shrp	X[ 3]=X[ 3],X[ 2],56	}
{ .mii;	shrp	X[ 2]=X[ 2],X[ 1],56
	shrp	X[ 1]=X[ 1],X[ 0],56	}
{ .mib;	shrp	X[ 0]=X[ 0],T1,56
	br.many	.L_first16	};;
.L2byte:
{ .mmi;	$LDW	X[11]=[input],4*$SZ
	$LDW	X[10]=[r8],4*$SZ
	shrp	X[15]=X[15],X[14],48	}
{ .mmi;	$LDW	X[ 9]=[r9],4*$SZ
	$LDW	X[ 8]=[r10],4*$SZ
	shrp	X[14]=X[14],X[13],48	};;
{ .mmi;	$LDW	X[ 7]=[input],4*$SZ
	$LDW	X[ 6]=[r8],4*$SZ
	shrp	X[13]=X[13],X[12],48	}
{ .mmi;	$LDW	X[ 5]=[r9],4*$SZ
	$LDW	X[ 4]=[r10],4*$SZ
	shrp	X[12]=X[12],X[11],48	};;
{ .mmi;	$LDW	X[ 3]=[input],4*$SZ
	$LDW	X[ 2]=[r8],4*$SZ
	shrp	X[11]=X[11],X[10],48	}
{ .mmi;	$LDW	X[ 1]=[r9],4*$SZ
	$LDW	X[ 0]=[r10],4*$SZ
	shrp	X[10]=X[10],X[ 9],48	};;
{ .mii;	$LDW	T1=[input]
	shrp	X[ 9]=X[ 9],X[ 8],48
	shrp	X[ 8]=X[ 8],X[ 7],48	}
{ .mii;	shrp	X[ 7]=X[ 7],X[ 6],48
	shrp	X[ 6]=X[ 6],X[ 5],48	};;
{ .mii;	shrp	X[ 5]=X[ 5],X[ 4],48
	shrp	X[ 4]=X[ 4],X[ 3],48	}
{ .mii;	shrp	X[ 3]=X[ 3],X[ 2],48
	shrp	X[ 2]=X[ 2],X[ 1],48	}
{ .mii;	shrp	X[ 1]=X[ 1],X[ 0],48
	shrp	X[ 0]=X[ 0],T1,48	}
{ .mfb;	br.many	.L_first16	};;
.L3byte:
{ .mmi;	$LDW	X[ 9]=[r9],4*$SZ
	$LDW	X[ 8]=[r10],4*$SZ
	shrp	X[15]=X[15],X[14],40	};;
{ .mmi;	$LDW	X[ 7]=[input],4*$SZ
	$LDW	X[ 6]=[r8],4*$SZ
	shrp	X[14]=X[14],X[13],40	}
{ .mmi;	$LDW	X[ 5]=[r9],4*$SZ
	$LDW	X[ 4]=[r10],4*$SZ
	shrp	X[13]=X[13],X[12],40	};;
{ .mmi;	$LDW	X[ 3]=[input],4*$SZ
	$LDW	X[ 2]=[r8],4*$SZ
	shrp	X[12]=X[12],X[11],40	}
{ .mmi;	$LDW	X[ 1]=[r9],4*$SZ
	$LDW	X[ 0]=[r10],4*$SZ
	shrp	X[11]=X[11],X[10],40	};;
{ .mii;	$LDW	T1=[input]
	shrp	X[10]=X[10],X[ 9],40
	shrp	X[ 9]=X[ 9],X[ 8],40	}
{ .mii;	shrp	X[ 8]=X[ 8],X[ 7],40
	shrp	X[ 7]=X[ 7],X[ 6],40	};;