📄 ppc.pl
字号:
#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there're several PowerPC ABI in use. Most notably
# Linux and AIX use different 32-bit ABIs. Good news are that these ABIs
# are similar enough to implement leaf(!) functions, which would be ABI
# neutral. And that's what you find here: ABI neutral leaf functions.
# In case you wonder what that is...
#
#       AIX performance
#
#       MEASUREMENTS WITH cc ON a 200 MhZ PowerPC 604e.
#
#       The following is the performance of 32-bit compiler
#       generated code:
#
#       OpenSSL 0.9.6c 21 dec 2001
#       built on: Tue Jun 11 11:06:51 EDT 2002
#       options:bn(64,32) ...
#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
#
#       Number of operations increases by at almost 75%
#
#       Here are performance numbers for 64-bit compiler
#       generated code:
#
#       OpenSSL 0.9.6g [engine] 9 Aug 2002
#       built on: Fri Apr 18 16:59:20 EDT 2003
#       options:bn(64,64) ...
#       compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
#                  sign    verify    sign/s verify/s
#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
#
#       Again, performance increases by at about 75%
#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
#       OpenSSL 0.9.7c 30 Sep 2003
#
#       Original code.
#
#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
#
#       Same benchmark with this assembler code:
#
#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
#
#       Performance increase of ~60%
#
#       If you have comments or suggestions to improve code send
#       me a note at schari@us.ibm.com
#

# First command-line argument: name of the assembly file to generate.
# Both the word size (32/64) and the target OS are derived from it.
$opf = shift;

# Select the mnemonic set by word size. These are deliberately package
# globals: the assembly template in data() interpolates them by name.
if ($opf =~ /32\.s/) {
    $BITS=  32;
    $BNSZ=  $BITS/8;
    $ISA=   "\"ppc\"";

    $LD=    "lwz";      # load
    $LDU=   "lwzu";     # load and update
    $ST=    "stw";      # store
    $STU=   "stwu";     # store and update
    $UMULL= "mullw";    # unsigned multiply low
    $UMULH= "mulhwu";   # unsigned multiply high
    $UDIV=  "divwu";    # unsigned divide
    $UCMPI= "cmplwi";   # unsigned compare with immediate
    $UCMP=  "cmplw";    # unsigned compare
    $CNTLZ= "cntlzw";   # count leading zeros
    $SHL=   "slw";      # shift left
    $SHR=   "srw";      # unsigned shift right
    $SHRI=  "srwi";     # unsigned shift right by immediate
    $SHLI=  "slwi";     # shift left by immediate
    $CLRU=  "clrlwi";   # clear upper bits
    $INSR=  "insrwi";   # insert right
    $ROTL=  "rotlwi";   # rotate left by immediate
    $TR=    "tw";       # conditional trap
} elsif ($opf =~ /64\.s/) {
    $BITS=  64;
    $BNSZ=  $BITS/8;
    $ISA=   "\"ppc64\"";

    # same as above, but 64-bit mnemonics...
    $LD=    "ld";       # load
    $LDU=   "ldu";      # load and update
    $ST=    "std";      # store
    $STU=   "stdu";     # store and update
    $UMULL= "mulld";    # unsigned multiply low
    $UMULH= "mulhdu";   # unsigned multiply high
    $UDIV=  "divdu";    # unsigned divide
    $UCMPI= "cmpldi";   # unsigned compare with immediate
    $UCMP=  "cmpld";    # unsigned compare
    $CNTLZ= "cntlzd";   # count leading zeros
    $SHL=   "sld";      # shift left
    $SHR=   "srd";      # unsigned shift right
    $SHRI=  "srdi";     # unsigned shift right by immediate
    $SHLI=  "sldi";     # shift left by immediate
    $CLRU=  "clrldi";   # clear upper bits
    $INSR=  "insrdi";   # insert right
    $ROTL=  "rotldi";   # rotate left by immediate
    $TR=    "td";       # conditional trap
} else {
    die "nonsense $opf";
}

# If a second command-line argument is present (any defined value) the
# output stays on the inherited STDOUT; otherwise STDOUT is redirected
# into $opf. Three-argument open keeps characters in the file name from
# being interpreted as an open mode.
unless (defined shift) {
    open STDOUT, '>', $opf or die "can't open $opf: $!";
}

# function entry points from the AIX code
#
# There are other, more elegant, ways to handle this. We (IBM) chose
# this approach as it plays well with scripts we run to 'namespace'
# OpenSSL .i.e. we add a prefix to all the public symbols so we can
# co-exist in the same process with other implementations of OpenSSL.
# 'cleverer' ways of doing these substitutions tend to hide data we
# need to be obvious.
#
my @items = ("bn_sqr_comba4",
             "bn_sqr_comba8",
             "bn_mul_comba4",
             "bn_mul_comba8",
             "bn_sub_words",
             "bn_add_words",
             "bn_div_words",
             "bn_sqr_words",
             "bn_mul_words",
             "bn_mul_add_words");

# Pick the name-decoration flavour from the output file name;
# anything that is not linux/aix/osx falls through to the BSD variant.
if    ($opf =~ /linux/) { do_linux(); }
elsif ($opf =~ /aix/)   { do_aix();   }
elsif ($opf =~ /osx/)   { do_osx();   }
else                    { do_bsd();   }

# Linux: print the template with symbol names adjusted.
#   64-bit: every ".name:" label is replaced by an entry in the ".opd"
#           section (which looks like a function descriptor: a .quad
#           triple of ".name", .TOC.@tocbase and 0) followed by the
#           dotted code label itself.
#   32-bit: the leading dot is simply removed from every entry point.
# In both cases internal "Lppcasm_" labels get a "." prefix.
sub do_linux {
    my $d = data();

    if ($BITS==64) {
        foreach my $t (@items) {
            # Built with explicit "\n" rather than backslash-newline
            # continuations inside s///, so the emitted text does not
            # depend on how this source file is line-wrapped.
            my $opd_entry = join "\n",
                "",                                   # leading newline
                qq{\t.section\t".opd","aw"},
                "\t.align\t3",
                "\t.globl\t$t",
                "$t:",
                "\t.quad\t.$t,.TOC.\@tocbase,0",
                "\t.size\t$t,24",
                "\t.previous\n",                      # plus a blank line
                "\t.type\t.$t,\@function",
                "\t.globl\t.$t",
                ".$t:";
            $d =~ s/\.$t:/$opd_entry/g;
        }
    }
    else {
        foreach my $t (@items) {
            $d =~ s/\.$t/$t/g;
        }
    }
    # hide internal labels to avoid pollution of name table...
    $d =~ s/Lppcasm_/.Lppcasm_/gm;
    print $d;
}

# AIX: print the template unchanged.
sub do_aix {
    # AIX assembler is smart enough to please the linker without
    # making us do something special...
    print data();
}

# MacOSX 32 bit
sub do_osx {
    my $d = data();
    # Change the bn symbol prefix from '.' to '_'
    foreach my $t (@items) {
        $d =~ s/\.$t/_$t/g;
    }
    # Change .machine to something OS X asm will accept
    $d =~ s/\.machine.*/.text/g;
    $d =~ s/\#/;/g;     # change comment from '#' to ';'
    print $d;
}

# BSD (Untested)
sub do_bsd {
    my $d = data();
    # Change the bn symbol prefix from '.' to '_'
    foreach my $t (@items) {
        $d =~ s/\.$t/_$t/g;
    }
    print $d;
}

sub data {
	local($data)=<<EOF;
#--------------------------------------------------------------------
#
#
#
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32 bitPowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)        Andy Polyakov
##
#	1. 
Initial version 10/20/02 Suresh Chari### The following file works for the xlc,cc# and gcc compilers.## NOTE: To get the file to link correctly with the gcc compiler# you have to change the names of the routines and remove# the first .(dot) character. This should automatically# be done in the build process.## Hand optimized assembly code for the following routines# # bn_sqr_comba4# bn_sqr_comba8# bn_mul_comba4# bn_mul_comba8# bn_sub_words# bn_add_words# bn_div_words# bn_sqr_words# bn_mul_words# bn_mul_add_words## NOTE: It is possible to optimize this code more for# specific PowerPC or Power architectures. On the Northstar# architecture the optimizations in this file do# NOT provide much improvement.## If you have comments or suggestions to improve code send# me a note at schari\@us.ibm.com##--------------------------------------------------------------------------## Defines to be used in the assembly code.# .set r0,0 # we use it as storage for value of 0.set SP,1 # preserved.set RTOC,2 # preserved .set r3,3 # 1st argument/return value.set r4,4 # 2nd argument/volatile register.set r5,5 # 3rd argument/volatile register.set r6,6 # ....set r7,7.set r8,8.set r9,9.set r10,10.set r11,11.set r12,12.set r13,13 # not used, nor any other "below" it....set BO_IF_NOT,4.set BO_IF,12.set BO_dCTR_NZERO,16.set BO_dCTR_ZERO,18.set BO_ALWAYS,20.set CR0_LT,0;.set CR0_GT,1;.set CR0_EQ,2.set CR1_FX,4;.set CR1_FEX,5;.set CR1_VX,6.set LR,8# Declare function names to be global# NOTE: For gcc these names MUST be changed to remove# the first . i.e. for example change ".bn_sqr_comba4"# to "bn_sqr_comba4". This should be automatically done# in the build. .globl .bn_sqr_comba4 .globl .bn_sqr_comba8 .globl .bn_mul_comba4 .globl .bn_mul_comba8 .globl .bn_sub_words .globl .bn_add_words .globl .bn_div_words .globl .bn_sqr_words .globl .bn_mul_words .globl .bn_mul_add_words # .text section .machine $ISA## NOTE: The following label name should be changed to# "bn_sqr_comba4" i.e. 
remove the first dot# for the gcc compiler. This should be automatically# done in the build#.align 4.bn_sqr_comba4:## Optimized version of bn_sqr_comba4.## void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)# r3 contains r# r4 contains a## Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: # # r5,r6 are the two BN_ULONGs being multiplied.# r7,r8 are the results of the 32x32 giving 64 bit multiply.# r9,r10, r11 are the equivalents of c1,c2, c3.# Here's the assembly## xor r0,r0,r0 # set r0 = 0. Used in the addze # instructions below #sqr_add_c(a,0,c1,c2,c3) $LD r5,`0*$BNSZ`(r4) $UMULL r9,r5,r5 $UMULH r10,r5,r5 #in first iteration. No need #to add since c1=c2=c3=0. # Note c3(r11) is NOT set to 0 # but will be. $ST r9,`0*$BNSZ`(r3) # r[0]=c1; # sqr_add_c2(a,1,0,c2,c3,c1); $LD r6,`1*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) adde r8,r8,r8 addze r9,r0 # catch carry if any. # r9= r0(=0) and carry addc r10,r7,r10 # now add to temp result. addze r11,r8 # r8 added to r11 which is 0 addze r9,r9 $ST r10,`1*$BNSZ`(r3) #r[1]=c2; #sqr_add_c(a,1,c3,c1,c2) $UMULL r7,r6,r6 $UMULH r8,r6,r6 addc r11,r7,r11 adde r9,r8,r9 addze r10,r0 #sqr_add_c2(a,2,0,c3,c1,c2) $LD r6,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r10,r10 addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 $ST r11,`2*$BNSZ`(r3) #r[2]=c3 #sqr_add_c2(a,3,0,c1,c2,c3); $LD r6,`3*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r11,r0 addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 #sqr_add_c2(a,2,1,c1,c2,c3); $LD r5,`1*$BNSZ`(r4) $LD r6,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r11,r11 addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 $ST r9,`3*$BNSZ`(r3) #r[3]=c1 #sqr_add_c(a,2,c2,c3,c1); $UMULL r7,r6,r6 $UMULH r8,r6,r6 addc r10,r7,r10 adde r11,r8,r11 addze r9,r0 #sqr_add_c2(a,3,1,c2,c3,c1); $LD r6,`3*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r9,r9 addc r10,r7,r10 adde 
r11,r8,r11 addze r9,r9 $ST r10,`4*$BNSZ`(r3) #r[4]=c2 #sqr_add_c2(a,3,2,c3,c1,c2); $LD r5,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r10,r0 addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 #sqr_add_c(a,3,c1,c2,c3); $UMULL r7,r6,r6 $UMULH r8,r6,r6 addc r9,r7,r9 adde r10,r8,r10 $ST r9,`6*$BNSZ`(r3) #r[6]=c1 $ST r10,`7*$BNSZ`(r3) #r[7]=c2 bclr BO_ALWAYS,CR0_LT .long 0x00000000## NOTE: The following label name should be changed to# "bn_sqr_comba8" i.e. remove the first dot# for the gcc compiler. This should be automatically# done in the build# .align 4.bn_sqr_comba8:## This is an optimized version of the bn_sqr_comba8 routine.# Tightly uses the adde instruction### void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)# r3 contains r# r4 contains a## Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: # # r5,r6 are the two BN_ULONGs being multiplied.# r7,r8 are the results of the 32x32 giving 64 bit multiply.# r9,r10, r11 are the equivalents of c1,c2, c3.## Possible optimization of loading all 8 longs of a into registers# doesnt provide any speedup# xor r0,r0,r0 #set r0 = 0.Used in addze #instructions below. #sqr_add_c(a,0,c1,c2,c3); $LD r5,`0*$BNSZ`(r4) $UMULL r9,r5,r5 #1st iteration: no carries. $UMULH r10,r5,r5 $ST r9,`0*$BNSZ`(r3) # r[0]=c1; #sqr_add_c2(a,1,0,c2,c3,c1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -