📄 aes-x86_64.pl
字号:
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# Version 1.2.
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
#
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB		CBC encrypt
# AMD64		13.7		13.0(*)
# EM64T		20.2		18.6(*)
#
# (*)	CBC benchmarks are better than ECB thanks to custom ABI used
#	by the private block encryption function.

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores

# The first command-line argument is handed to the translator script;
# everything printed to STDOUT from here on is piped through it.
$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"
	or die "can't call ../perlasm/x86_64-xlate.pl: $!";

# Accumulator for the generated assembly text.
$code=".text\n";

# Register assignment used by the code templates below.
$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$acc0="%esi";
$acc1="%edi";
$acc2="%ebp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";

# hi(REG): return REG with an a/b/c/d-family name (%eax, %rbx, ...)
# rewritten to its high-byte form (%ah, %bh, ...).  Other names are
# returned unchanged.  Callers in this file use the &hi(...) form, which
# bypasses the empty prototype.
sub hi()
{ my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;
    $r;
}

# lo(REG): return the low-byte name of REG:
#   %eax/%rax   -> %al   (a/b/c/d family)
#   %esi/%rdi   -> %sil/%dil
#   %rN or %rNd -> %rNb
sub lo()
{ my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    $r;
}

# _data_word(LIST): for every value append a ".long" line to $code that
# carries the value twice.
sub _data_word()
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}

# data_word(LIST): append a single ".long" line to $code listing all
# values as comma-separated 32-bit hex constants.
sub data_word()
{ my $i;
  my $last=pop(@_);	# emitted separately so the line has no trailing comma
    $code.=".long\t";
    while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
    $code.=sprintf"0x%08x\n",$last;
}

# data_byte(LIST): append a single ".byte" line to $code; every value is
# masked to its low 8 bits.
sub data_byte()
{ my $i;
  my $last=pop(@_);	# emitted separately so the line has no trailing comma
    $code.=".byte\t";
    while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
    $code.=sprintf"0x%02x\n",$last&0xff;
}

# encvert(): append the body of the .Lenc_loop iteration ("vertical
# spin" variant) to $code.  It combines table lookups at offsets 0..3
# from $sbox into $t0..$t3, advances $key by 16 and xors the next four
# key words into $s0..$s3.  The `...` expressions are left in the text
# as-is -- presumably expanded by a later pass that is not visible in
# this part of the file.
sub encvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enclastvert(): append the code that follows .Lenc_loop ("vertical
# spin" variant) to $code.  Each table lookup is masked down to a single
# byte lane before being combined, and the key words at 16+0..16+12
# from $key are xored in at the end.
sub enclastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	2($sbox,$acc0,8),$t0
	mov	2($sbox,$acc1,8),$t1
	mov	2($sbox,$acc2,8),$t2

	and	\$0x000000ff,$t0
	and	\$0x000000ff,$t1
	and	\$0x000000ff,$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x000000ff,$t3
	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# encstep(I, S0, S1, S2, S3): append to $code the instructions that
# compute output word I (0..3) of a loop iteration from the four state
# registers given in rotated order.
#   I == 0..2 : result goes to $t0..$t2; $acc0..$acc2 serve as scratch.
#   I == 3    : result goes to S0 itself, S1..S3 serve as scratch, and
#               $t0..$t2 are finally moved back into S1..S3.
# The "lea 16($key),$key" advancing the key pointer is emitted once,
# for I == 0.
sub encstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="\tmovzb\t".&lo($s[0]).",$out\n";
	$code.="\tmov\t$s[2],$tmp1\n"		if ($i!=3);
	$code.="\tlea\t16($key),$key\n"		if ($i==0);

	$code.="\tmovzb\t".&hi($s[1]).",$tmp0\n";
	$code.="\tmov\t0($sbox,$out,8),$out\n";

	$code.="\tshr\t\$16,$tmp1\n";
	$code.="\tmov\t$s[3],$tmp2\n"		if ($i!=3);
	$code.="\txor\t3($sbox,$tmp0,8),$out\n";

	$code.="\tmovzb\t".&lo($tmp1).",$tmp1\n";
	$code.="\tshr\t\$24,$tmp2\n";
	$code.="\txor\t4*$i($key),$out\n";

	$code.="\txor\t2($sbox,$tmp1,8),$out\n";
	$code.="\txor\t1($sbox,$tmp2,8),$out\n";

	$code.="\tmov\t$t0,$s[1]\n"		if ($i==3);
	$code.="\tmov\t$t1,$s[2]\n"		if ($i==3);
	$code.="\tmov\t$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

# enclast(I, S0, S1, S2, S3): counterpart of encstep() for the code
# emitted after the loop.  Every table lookup is masked to one byte lane
# before being xored into the result; no key material is mixed in here
# (the caller emits the key xor separately).  Register usage follows
# encstep(): I == 3 writes into S0 and then restores S1..S3 from
# $t0..$t2.
sub enclast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="\tmovzb\t".&lo($s[0]).",$out\n";
	$code.="\tmov\t$s[2],$tmp1\n"		if ($i!=3);

	$code.="\tmov\t2($sbox,$out,8),$out\n";
	$code.="\tshr\t\$16,$tmp1\n";
	$code.="\tmov\t$s[3],$tmp2\n"		if ($i!=3);

	$code.="\tand\t\$0x000000ff,$out\n";
	$code.="\tmovzb\t".&hi($s[1]).",$tmp0\n";
	$code.="\tmovzb\t".&lo($tmp1).",$tmp1\n";
	$code.="\tshr\t\$24,$tmp2\n";

	$code.="\tmov\t0($sbox,$tmp0,8),$tmp0\n";
	$code.="\tmov\t0($sbox,$tmp1,8),$tmp1\n";
	$code.="\tmov\t2($sbox,$tmp2,8),$tmp2\n";

	$code.="\tand\t\$0x0000ff00,$tmp0\n";
	$code.="\tand\t\$0x00ff0000,$tmp1\n";
	$code.="\tand\t\$0xff000000,$tmp2\n";

	$code.="\txor\t$tmp0,$out\n";
	$code.="\tmov\t$t0,$s[1]\n"		if ($i==3);
	$code.="\txor\t$tmp1,$out\n";
	$code.="\tmov\t$t1,$s[2]\n"		if ($i==3);
	$code.="\txor\t$tmp2,$out\n";
	$code.="\tmov\t$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

# Private block-encryption routine: state in $s0..$s3, key schedule in
# $key, lookup table in $sbox.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		$code.=<<___;
	xor	16+0($key),$s0		# xor with key
	xor	16+4($key),$s1
	xor	16+8($key),$s2
	xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___

# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
$code.=<<___;
.globl	AES_encrypt
.type	AES_encrypt,\@function,3
.align	16
AES_encrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	%rdx,$key
	mov	%rdi,$inp
	mov	%rsi,$out

	.picmeup	$sbox
	lea	AES_Te-.($sbox),$sbox

	mov	0($inp),$s0
	mov	4($inp),$s1
	mov	8($inp),$s2
	mov	12($inp),$s3

	call	_x86_64_AES_encrypt

	mov	$s0,0($out)
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	AES_encrypt,.-AES_encrypt
___

#------------------------------------------------------------------#

# decvert(): decryption counterpart of encvert().  Appends one block of
# table-lookup code to $code, advances $key by 16 and xors the next four
# key words into $s0..$s3.  Uses %r8d as a fourth temporary.
sub decvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s1")`,$acc0
	shr	\$16,$s0
	movzb	`&hi("$s2")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	mov	12($key),$s3
	movzb	`&hi("$s0")`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
}

# declastvert(): decryption counterpart of enclastvert(); uses byte
# lookups at offset 2048 from $sbox, shifted into place.
# NOTE(review): the available source stops partway through this sub's
# here-document.  The remaining instructions, the closing "___" and the
# closing brace are not visible and have deliberately not been
# reconstructed -- restore them from the original file.
sub declastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2048($sbox,$acc0,1),$t0
	movzb	2048($sbox,$acc1,1),$t1
	movzb	2048($sbox,$acc2,1),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	movzb	2048($sbox,$acc0,1),$t3
	movzb	2048($sbox,$acc1,1),$acc1	#$t0
	movzb	2048($sbox,$acc2,1),$acc2	#$t1

	shl	\$8,$acc1
	shl	\$8,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s0
	movzb	2048($sbox,$acc0,1),$acc0	#$t2
	movzb	2048($sbox,$acc1,1),$acc1	#$t3

	shl	\$8,$acc0
	shl	\$8,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	movzb	2048($sbox,$acc0,1),$acc0	#$t0
	movzb	2048($sbox,$acc1,1),$acc1	#$t1
	movzb	2048($sbox,$acc2,1),$acc2	#$t2

	shl	\$16,$acc0
	shl	\$16,$acc1
	shl	\$16,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2048($sbox,$acc0,1),$acc0	#$t3
	movzb	2048($sbox,$acc1,1),$acc1	#$t0
	movzb	2048($sbox,$acc2,1),$acc2	#$t1

	shl	\$16,$acc0
	shl	\$24,$acc1
	shl	\$24,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	mov	16+12($key),$s3
	movzb	2048($sbox,$acc0,1),$acc0	#$t2
	movzb	2048($sbox,$acc1,1),$acc1	#$t3
	mov	16+0($key),$s0

	shl	\$24,$acc0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -