📄 aes-x86_64.pl
字号:
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Version 2.1.
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
#
# Performance in number of cycles per processed byte for 128-bit key:
#
#               ECB encrypt     ECB decrypt     CBC large chunk
# AMD64         33              41              13.0
# EM64T         38              59              18.6(*)
# Core 2        30              43              14.5(*)
#
# (*) with hyper-threading off

$verticalspin=1;        # unlike 32-bit version $verticalspin performs
                        # ~15% better on both AMD and Intel cores
$speed_limit=512;       # see aes-586.pl for details

# First command-line argument; forwarded verbatim to the translator below.
$output=shift;

# Look for x86_64-xlate.pl next to this script, then under ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The command string is unchanged; only the failure check is new.
open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

# Generated assembly is accumulated in $code.
$code=".text\n";

# Register assignment used by all emitters below.
$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$acc0="%esi";   $mask80="%rsi";
$acc1="%edi";   $maskfe="%rdi";
$acc2="%ebp";   $mask1b="%rbp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";

# hi(REG): rewrite %eax/%rax-style names (a..d) to the %ah-style name.
# Any other register name is returned unchanged.
sub hi()
{
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;
    $r;
}

# lo(REG): rewrite a register name to its low-byte form:
#   %eax/%rax -> %al, %esi/%rdi -> %sil/%dil, %r10/%r10d -> %r10b.
# Names matching none of the patterns (e.g. %ebp) are returned unchanged.
sub lo()
{
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r;
}

# LO(REG): rewrite a 64-bit register name to its 32-bit form:
#   %rax -> %eax, %r10 -> %r10d.
sub LO()
{
    my $r=shift;
    $r =~ s/%r([a-z]+)/%e${1}/;
    $r =~ s/%r([0-9]+)/%r${1}d/;
    $r;
}

# _data_word(LIST): append one ".long v,v" line per argument (each value
# is emitted twice). Stops at the first undefined argument.
sub _data_word()
{
    my $i;
    while(defined($i=shift)) {
        $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i;
    }
}

# data_word(LIST): append a single ".long" line holding every argument
# as a comma-separated list of 32-bit hex values.
sub data_word()
{
    my $i;
    my $last=pop(@_);   # emitted separately so the line has no trailing comma
    $code.=".long\t";
    while(defined($i=shift)) {
        $code.=sprintf"0x%08x,",$i;
    }
    $code.=sprintf"0x%08x\n",$last;
}

# data_byte(LIST): append a single ".byte" line holding every argument,
# each masked to its low 8 bits.
sub data_byte()
{
    my $i;
    my $last=pop(@_);   # emitted separately so the line has no trailing comma
    $code.=".byte\t";
    while(defined($i=shift)) {
        $code.=sprintf"0x%02x,",$i&0xff;
    }
    $code.=sprintf"0x%02x\n",$last&0xff;
}

# encvert(): append the loop body used when $verticalspin is set.
# Bytes of $s0..$s3 index the table at $sbox (8-byte stride, read at byte
# offsets 0..3); the key pointer is advanced by 16 and the four words at
# 0..12($key) are XORed with the accumulated $t0..$t3 into $s0..$s3.
sub encvert()
{
    my $t3="%r8d";      # zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enclastvert(): append the code that follows the loop when $verticalspin
# is set. Table entries are masked down to a single byte lane before being
# combined, and the result is XORed with the words at 16+0..16+12($key).
sub enclastvert()
{
    my $t3="%r8d";      # zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t0
	movzb	2($sbox,$acc1,8),$t1
	movzb	2($sbox,$acc2,8),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# encstep(I, S0, S1, S2, S3): append the instructions producing output
# word I (0..3) from the four state registers given in rotated order.
# Words 0..2 are built in $t0..$t2. For I==3 the result is built in place
# in S0, S1..S3 double as scratch registers, and $t0..$t2 are copied back
# into S1..S3 at the end. The key pointer is advanced once, when I==0.
sub encstep()
{
    my ($i,@s) = @_;
    my $tmp0=$acc0;
    my $tmp1=$acc1;
    my $tmp2=$acc2;
    my $out=($t0,$t1,$t2,$s[0])[$i];

    if ($i==3) {
        $tmp0=$s[1];
        $tmp1=$s[2];
        $tmp2=$s[3];
    }

    $code.=" movzb ".&lo($s[0]).",$out\n";
    $code.=" mov $s[2],$tmp1\n" if ($i!=3);
    $code.=" lea 16($key),$key\n" if ($i==0);

    $code.=" movzb ".&hi($s[1]).",$tmp0\n";
    $code.=" mov 0($sbox,$out,8),$out\n";

    $code.=" shr \$16,$tmp1\n";
    $code.=" mov $s[3],$tmp2\n" if ($i!=3);
    $code.=" xor 3($sbox,$tmp0,8),$out\n";

    $code.=" movzb ".&lo($tmp1).",$tmp1\n";
    $code.=" shr \$24,$tmp2\n";
    $code.=" xor 4*$i($key),$out\n";

    $code.=" xor 2($sbox,$tmp1,8),$out\n";
    $code.=" xor 1($sbox,$tmp2,8),$out\n";

    $code.=" mov $t0,$s[1]\n" if ($i==3);
    $code.=" mov $t1,$s[2]\n" if ($i==3);
    $code.=" mov $t2,$s[3]\n" if ($i==3);
    $code.="\n";
}

# enclast(I, S0, S1, S2, S3): counterpart of encstep() used after the
# loop. Same register conventions, but each table entry is masked to one
# byte lane and no key material is mixed in here (the caller emits the
# key XOR separately).
sub enclast()
{
    my ($i,@s)=@_;
    my $tmp0=$acc0;
    my $tmp1=$acc1;
    my $tmp2=$acc2;
    my $out=($t0,$t1,$t2,$s[0])[$i];

    if ($i==3) {
        $tmp0=$s[1];
        $tmp1=$s[2];
        $tmp2=$s[3];
    }

    $code.=" movzb ".&lo($s[0]).",$out\n";
    $code.=" mov $s[2],$tmp1\n" if ($i!=3);

    $code.=" mov 2($sbox,$out,8),$out\n";
    $code.=" shr \$16,$tmp1\n";
    $code.=" mov $s[3],$tmp2\n" if ($i!=3);

    $code.=" and \$0x000000ff,$out\n";
    $code.=" movzb ".&hi($s[1]).",$tmp0\n";
    $code.=" movzb ".&lo($tmp1).",$tmp1\n";
    $code.=" shr \$24,$tmp2\n";

    $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
    $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
    $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";

    $code.=" and \$0x0000ff00,$tmp0\n";
    $code.=" and \$0x00ff0000,$tmp1\n";
    $code.=" and \$0xff000000,$tmp2\n";

    $code.=" xor $tmp0,$out\n";
    $code.=" mov $t0,$s[1]\n" if ($i==3);
    $code.=" xor $tmp1,$out\n";
    $code.=" mov $t1,$s[2]\n" if ($i==3);
    $code.=" xor $tmp2,$out\n";
    $code.=" mov $t2,$s[3]\n" if ($i==3);
    $code.="\n";
}

# _x86_64_AES_encrypt: XOR the state with the first 16 key bytes, load
# the round count from 240($key), run the loop body (count-1) times, then
# emit the post-loop code.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
    if ($verticalspin) { &encvert(); }
    else {
        &encstep(0,$s0,$s1,$s2,$s3);
        &encstep(1,$s1,$s2,$s3,$s0);
        &encstep(2,$s2,$s3,$s0,$s1);
        &encstep(3,$s3,$s0,$s1,$s2);
    }
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
    if ($verticalspin) { &enclastvert(); }
    else {
        &enclast(0,$s0,$s1,$s2,$s3);
        &enclast(1,$s1,$s2,$s3,$s0);
        &enclast(2,$s2,$s3,$s0,$s1);
        &enclast(3,$s3,$s0,$s1,$s2);
        $code.=<<___;
	xor	16+0($key),$s0		# xor with key
	xor	16+4($key),$s1
	xor	16+8($key),$s2
	xor	16+12($key),$s3
___
    }
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___

# it's possible to implement this by shifting tN by 8, filling least
# significant byte with byte load and finally bswap-ing at the end,
# but such partial register load kills Core 2...
#
# enccompactvert(): append byte-wise lookups into the table at $sbox
# (stride 1) for all 16 state bytes, reassembling the results with shifts
# into $s0..$s3. Uses %r8d, %r9d and %r13d as extra scratch registers.
sub enccompactvert()
{
    my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s3")`,$acc2
	movzb	`&hi("$s0")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s3")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	`&hi("$s0")`,$acc1
	shr	\$8,$s2
	shr	\$8,$s1
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2
	shl	\$16,$t4
	shl	\$16,$t5
	shl	\$16,$acc2
	xor	$t4,$t1
	xor	$t5,$t2
	xor	$acc2,$t3

	shl	\$24,$acc0
	shl	\$24,$acc1
	shl	\$24,$s3
	xor	$acc0,$t0
	shl	\$24,$s2
	xor	$acc1,$t1
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enctransform_ref(REG): append a transform of the single 32-bit register
# REG. $r2 becomes the per-byte doubling of REG with 0x1b folded into
# every byte whose top bit was set; REG is then combined with $r2 and
# rotated copies of its original value.
# NOTE(review): this looks like the AES MixColumns step - confirm.
# Uses %r8d, %r9d and %r13d as scratch registers.
sub enctransform_ref()
{
    my $sn = shift;
    my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");

$code.=<<___;
	mov	$sn,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tmp
	shr	\$7,$tmp
	lea	($sn,$sn),$r2
	sub	$tmp,$acc
	and	\$0xfefefefe,$r2
	and	\$0x1b1b1b1b,$acc
	mov	$sn,$tmp
	xor	$acc,$r2

	xor	$r2,$sn
	rol	\$24,$sn
	xor	$r2,$sn
	ror	\$16,$tmp
	xor	$tmp,$sn
	ror	\$8,$tmp
	xor	$tmp,$sn
___
}

# unlike decrypt case it does not pay off to parallelize enctransform
sub enctransform()
{
    my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s0
	xor	$r21,$s1
	mov	$s2,$acc0
	mov	$s3,$acc1
	rol	\$24,$s0
	rol	\$24,$s1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	xor	$r20,$s0
	xor	$r21,$s1
	mov	$acc0,$t2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -