📄 armv4-mont.pl
字号:
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# How this script works: the register names below are interpolated into
# the assembly template held in $code, one instruction spelling "bx lr"
# is rewritten to its raw opcode word, and the result is printed to
# STDOUT. The script reads no arguments and no input.
#
# The generated routine, bn_mul_mont, returns 0 in r0 without doing any
# work when its "num" argument is less than 2, and returns 1 in r0 after
# completing the multiplication otherwise.

use strict;
use warnings;

my $num="r0";	# starts as num argument, but holds &tp[num-1]
my $ap="r1";
my $bp="r2"; my $bi="r2"; my $rp="r2";	# three roles share r2 at different times
my $np="r3";
my $tp="r4";
my $aj="r5";
my $nj="r6";
my $tj="r7";
my $n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
my $alo="r10";	# sl, gcc uses it to keep @GOT
my $ahi="r11";	# fp
my $nlo="r12";	# ip
###########	# r13 is stack pointer
my $nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
my $_rp="$num,#12*4";
# ap permanently resides in r1
my $_bp="$num,#13*4";
# np permanently resides in r3
my $_n0="$num,#14*4";
my $_num="$num,#15*4";	my $_bpend=$_num;	# the num slot is reused to hold &bp[num]

# Assembly template. Every $name below is interpolated by Perl, including
# the ones that appear inside "@" assembler comments.
my $code=<<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	adc	$nhi,$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	str	$nlo,[$num]		@ tp[num-1]=
	ldr	$n0,[$_n0]		@ restore n0
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adc	$ahi,$ahi,#0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	adc	$nhi,$nhi,#0
	adds	$nlo,$nlo,$tj
	adc	$nhi,$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	str	$nlo,[$num]		@ tp[num-1]=
	ldr	$n0,[$_n0]		@ restore n0
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit "bx lr" as a literal opcode word so that assemblers targeting plain
# ARMv4 accept the file.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;

# Buffered write errors (full disk, closed pipe) are only reported when the
# handle is closed; fail loudly instead of leaving a truncated assembly file
# behind with a success exit status.
close STDOUT or die "error closing STDOUT: $!";
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -