📄 aix_ppc32.s
Font size:
#--------------------------------------------------------------------
#
#	File:		ppc32.s
#
#	Created by:	Suresh Chari
#			IBM Thomas J. Watson Research Library
#			Hawthorne, NY
#
#
#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32 bit PowerPC platform.
#
#
#	Version History
#
#	2. Fixed bn_add, bn_sub and bn_div_words, added comments,
#	   cleaned up code. Also made a single version which can
#	   be used for both the AIX and Linux compilers. See NOTE
#	   below.
#				12/05/03		Suresh Chari
#			(with lots of help from)	Andy Polyakov
#
#	1. Initial version	10/20/02		Suresh Chari
#
#
#	The following file works for the xlc, cc
#	and gcc compilers.
#
#	NOTE:	To get the file to link correctly with the gcc compiler
#		you have to change the names of the routines and remove
#		the first .(dot) character. This should automatically
#		be done in the build process.
#
#	Hand optimized assembly code for the following routines
#
#	bn_sqr_comba4
#	bn_sqr_comba8
#	bn_mul_comba4
#	bn_mul_comba8
#	bn_sub_words
#	bn_add_words
#	bn_div_words
#	bn_sqr_words
#	bn_mul_words
#	bn_mul_add_words
#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures.
#	On the Northstar
#	architecture the optimizations in this file do
#	NOT provide much improvement.
#
#	If you have comments or suggestions to improve code send
#	me a note at schari@us.ibm.com
#
#--------------------------------------------------------------------------
#
#	Defines to be used in the assembly code.
#
.set r0,0	# we use it as storage for value of 0
.set SP,1	# preserved
.set RTOC,2	# preserved
.set r3,3	# 1st argument/return value
.set r4,4	# 2nd argument/volatile register
.set r5,5	# 3rd argument/volatile register
.set r6,6	# ...
.set r7,7
.set r8,8
.set r9,9
.set r10,10
.set r11,11
.set r12,12
.set r13,13	# not used, nor any other "below" it...

# Branch-operand (BO) and condition-register field encodings for bclr/bc.
.set BO_IF_NOT,4
.set BO_IF,12
.set BO_dCTR_NZERO,16
.set BO_dCTR_ZERO,18
.set BO_ALWAYS,20
.set CR0_LT,0;.set CR0_GT,1;.set CR0_EQ,2
.set CR1_FX,4;.set CR1_FEX,5;.set CR1_VX,6
.set LR,8

#	Declare function names to be global
#	NOTE:	For gcc these names MUST be changed to remove
#	the first . i.e. for example change ".bn_sqr_comba4"
#	to "bn_sqr_comba4". This should be automatically done
#	in the build.

	.globl	.bn_sqr_comba4
	.globl	.bn_sqr_comba8
	.globl	.bn_mul_comba4
	.globl	.bn_mul_comba8
	.globl	.bn_sub_words
	.globl	.bn_add_words
	.globl	.bn_div_words
	.globl	.bn_sqr_words
	.globl	.bn_mul_words
	.globl	.bn_mul_add_words

# .text section

	.machine	"ppc"

#
#	NOTE:	The following label name should be changed to
#	"bn_sqr_comba4" i.e. remove the first dot
#	for the gcc compiler. This should be automatically
#	done in the build
#
.align	4
.bn_sqr_comba4:
#
# Optimized version of bn_sqr_comba4.
#
# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
# Here's the assembly
#
	xor	r0,r0,r0		# set r0 = 0. Used in the addze
					# instructions below

					#sqr_add_c(a,0,c1,c2,c3)
	lwz	r5,0(r4)
	mullw	r9,r5,r5
	mulhwu	r10,r5,r5		#in first iteration. No need
					#to add since c1=c2=c3=0.
					# Note c3(r11) is NOT set to 0
					# but will be.

	stw	r9,0(r3)		# r[0]=c1;
					# sqr_add_c2(a,1,0,c2,c3,c1);
	lwz	r6,4(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
	adde	r8,r8,r8
	addze	r9,r0			# catch carry if any.
					# r9= r0(=0) and carry

	addc	r10,r7,r10		# now add to temp result.
	addze	r11,r8			# r8 added to r11 which is 0
	addze	r9,r9

	stw	r10,4(r3)		#r[1]=c2;
					#sqr_add_c(a,1,c3,c1,c2)
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
					#sqr_add_c2(a,2,0,c3,c1,c2)
	lwz	r6,8(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	stw	r11,8(r3)		#r[2]=c3
					#sqr_add_c2(a,3,0,c1,c2,c3);
	lwz	r6,12(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
					#sqr_add_c2(a,2,1,c1,c2,c3);
	lwz	r5,4(r4)
	lwz	r6,8(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r11,r11
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	stw	r9,12(r3)		#r[3]=c1
					#sqr_add_c(a,2,c2,c3,c1);
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
					#sqr_add_c2(a,3,1,c2,c3,c1);
	lwz	r6,12(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r9,r9
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	stw	r10,16(r3)		#r[4]=c2
					#sqr_add_c2(a,3,2,c3,c1,c2);
	lwz	r5,8(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6
	addc	r7,r7,r7
	adde	r8,r8,r8
	addze	r10,r0
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	stw	r11,20(r3)		#r[5] = c3
					#sqr_add_c(a,3,c1,c2,c3);
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10

	stw	r9,24(r3)		#r[6]=c1
	stw	r10,28(r3)		#r[7]=c2
	bclr	BO_ALWAYS,CR0_LT
	.long	0x00000000
#
#	NOTE:	The following label name should be changed to
#	"bn_sqr_comba8" i.e. remove the first dot
#	for the gcc compiler.
#	This should be automatically
#	done in the build
#
.align	4
.bn_sqr_comba8:
#
# This is an optimized version of the bn_sqr_comba8 routine.
# Tightly uses the adde instruction
#
#
# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesnt provide any speedup
#
	xor	r0,r0,r0		#set r0 = 0.Used in addze
					#instructions below.

					#sqr_add_c(a,0,c1,c2,c3);
	lwz	r5,0(r4)
	mullw	r9,r5,r5		#1st iteration:	no carries.
	mulhwu	r10,r5,r5
	stw	r9,0(r3)		# r[0]=c1;
					#sqr_add_c2(a,1,0,c2,c3,c1);
	lwz	r6,4(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10		#add the two register number
	adde	r11,r8,r0		# (r8,r7) to the three register
	addze	r9,r0			# number (r9,r11,r10).NOTE:r0=0

	addc	r10,r7,r10		#add the two register number
	adde	r11,r8,r11		# (r8,r7) to the three register
	addze	r9,r9			# number (r9,r11,r10).
	stw	r10,4(r3)		# r[1]=c2
					#sqr_add_c(a,1,c3,c1,c2);
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
					#sqr_add_c2(a,2,0,c3,c1,c2);
	lwz	r6,8(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	stw	r11,8(r3)		#r[2]=c3
					#sqr_add_c2(a,3,0,c1,c2,c3);
	lwz	r6,12(r4)		#r6 = a[3]. r5 is already a[0].
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
					#sqr_add_c2(a,2,1,c1,c2,c3);
	lwz	r5,4(r4)
	lwz	r6,8(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	stw	r9,12(r3)		#r[3]=c1;
					#sqr_add_c(a,2,c2,c3,c1);
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
					#sqr_add_c2(a,3,1,c2,c3,c1);
	lwz	r6,12(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
					#sqr_add_c2(a,4,0,c2,c3,c1);
	lwz	r5,0(r4)
	lwz	r6,16(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	stw	r10,16(r3)		#r[4]=c2;
					#sqr_add_c2(a,5,0,c3,c1,c2);
	lwz	r6,20(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
					#sqr_add_c2(a,4,1,c3,c1,c2);
	lwz	r5,4(r4)
	lwz	r6,16(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
					#sqr_add_c2(a,3,2,c3,c1,c2);
	lwz	r5,8(r4)
	lwz	r6,12(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	stw	r11,20(r3)		#r[5]=c3;
					#sqr_add_c(a,3,c1,c2,c3);
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0
					#sqr_add_c2(a,4,2,c1,c2,c3);
	lwz	r6,16(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
					#sqr_add_c2(a,5,1,c1,c2,c3);
	lwz	r5,4(r4)
	lwz	r6,20(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
					#sqr_add_c2(a,6,0,c1,c2,c3);
	lwz	r5,0(r4)
	lwz	r6,24(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	stw	r9,24(r3)		#r[6]=c1;
					#sqr_add_c2(a,7,0,c2,c3,c1);
	lwz	r6,28(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
					#sqr_add_c2(a,6,1,c2,c3,c1);
	lwz	r5,4(r4)
	lwz	r6,24(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
					#sqr_add_c2(a,5,2,c2,c3,c1);
	lwz	r5,8(r4)
	lwz	r6,20(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
					#sqr_add_c2(a,4,3,c2,c3,c1);
	lwz	r5,12(r4)
	lwz	r6,16(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	stw	r10,28(r3)		#r[7]=c2;
					#sqr_add_c(a,4,c3,c1,c2);
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r0
					#sqr_add_c2(a,5,3,c3,c1,c2);
	lwz	r6,20(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
					#sqr_add_c2(a,6,2,c3,c1,c2);
	lwz	r5,8(r4)
	lwz	r6,24(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
					#sqr_add_c2(a,7,1,c3,c1,c2);
	lwz	r5,4(r4)
	lwz	r6,28(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6
	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10

	addc	r11,r7,r11
	adde	r9,r8,r9
	addze	r10,r10
	stw	r11,32(r3)		#r[8]=c3;
					#sqr_add_c2(a,7,2,c1,c2,c3);
	lwz	r5,8(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r0

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
					#sqr_add_c2(a,6,3,c1,c2,c3);
	lwz	r5,12(r4)
	lwz	r6,24(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
					#sqr_add_c2(a,5,4,c1,c2,c3);
	lwz	r5,16(r4)
	lwz	r6,20(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11

	addc	r9,r7,r9
	adde	r10,r8,r10
	addze	r11,r11
	stw	r9,36(r3)		#r[9]=c1;
					#sqr_add_c(a,5,c2,c3,c1);
	mullw	r7,r6,r6
	mulhwu	r8,r6,r6
	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r0
					#sqr_add_c2(a,6,4,c2,c3,c1);
	lwz	r6,24(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
					#sqr_add_c2(a,7,3,c2,c3,c1);
	lwz	r5,12(r4)
	lwz	r6,28(r4)
	mullw	r7,r5,r6
	mulhwu	r8,r5,r6

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9

	addc	r10,r7,r10
	adde	r11,r8,r11
	addze	r9,r9
	stw	r10,40(r3)		#r[10]=c2;
					#sqr_add_c2(a,7,4,c3,c1,c2);
	lwz	r5,16(r4)
	mullw	r7,r5,r6
# NOTE(review): the captured source chunk ends here, mid-way through
# sqr_add_c2(a,7,4,c3,c1,c2). The remainder of bn_sqr_comba8 and the other
# routines declared .globl above lie outside this chunk — recover them from
# the upstream ppc32.s before assembling.
⌨️ Keyboard shortcuts
Copy code
Ctrl + C
Search code
Ctrl + F
Full-screen mode
F11
Toggle theme
Ctrl + Shift + D
Show shortcuts
?
Increase font size
Ctrl + =
Decrease font size
Ctrl + -