📄 aes-x86_64.pl
字号:
mov	$acc1,$t3
	ror	\$16,$t0
	ror	\$16,$t1
	shr	\$7,$t2
	lea	($s2,$s2),$r20
	xor	$t0,$s0
	xor	$t1,$s1
	shr	\$7,$t3
	lea	($s3,$s3),$r21
	ror	\$8,$t0
	ror	\$8,$t1
	sub	$t2,$acc0
	sub	$t3,$acc1
	xor	$t0,$s0
	xor	$t1,$s1

	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s2,$t2
	mov	$s3,$t3
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s2
	xor	$r21,$s3
	rol	\$24,$s2
	rol	\$24,$s3
	xor	$r20,$s2
	xor	$r21,$s3
	mov	0($sbox),$acc0		# prefetch Te4
	ror	\$16,$t2
	ror	\$16,$t3
	mov	64($sbox),$acc1
	xor	$t2,$s2
	xor	$t3,$s3
	mov	128($sbox),$r20
	ror	\$8,$t2
	ror	\$8,$t3
	mov	192($sbox),$r21
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# Emit _x86_64_AES_encrypt_compact, the internal compact encryption routine.
#
# Emitted control flow:
#   - eight loads at 32-byte strides starting at $sbox (table prefetch);
#     the loaded values are overwritten and never used;
#   - .Lenc_loop_compact: xor the four state words with the 16 bytes at $key,
#     advance $key by 16, then run the code emitted by &enccompactvert();
#   - leave the loop once $key equals the value at 16(%rsp), otherwise run the
#     code emitted by &enctransform() and iterate;
#   - on exit xor the state with the 16 bytes at $key and return.
#
# 16(%rsp) is read relative to the callee's stack: AES_encrypt pushes the
# end-of-schedule pointer and then $key before the call, so after the return
# address is pushed the end pointer sits at offset 16.
$code.=<<___;
.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt_compact:
	lea	128($sbox),$inp		# size optimization
	mov	0-128($inp),$acc1	# prefetch Te4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Lenc_loop_compact
.align	16
.Lenc_loop_compact:
	xor	0($key),$s0		# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	lea	16($key),$key
___
	&enccompactvert();
$code.=<<___;
	cmp	16(%rsp),$key
	je	.Lenc_compact_done
___
	&enctransform();
$code.=<<___;
	jmp	.Lenc_loop_compact
.align	16
.Lenc_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___

# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Emit the public entry point.  The emitted code:
#   - saves %rbx, %rbp, %r12-%r15;
#   - aligns %rsp down to 64 bytes and lowers it further by an amount derived
#     from the key schedule address (masked with 0x3c0);
#   - pushes, in order: the original %rsp, the output pointer (%rsi), the
#     end-of-schedule pointer ($key + 16*rounds) and $key, so that after the
#     call returns they are found at 24, 16, 8 and 0(%rsp) respectively;
#   - loads the 16-byte input block from %rdi into the four state registers;
#   - selects a table address relative to AES_Te+2048 using a 0x300 mask of
#     the distance between the frame and the table;
#   - calls _x86_64_AES_encrypt_compact, stores the state to the output
#     pointer, restores the original %rsp and the saved registers.
$code.=<<___;
.globl	AES_encrypt
.type	AES_encrypt,\@function,3
.align	16
AES_encrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# allocate frame "above" key schedule
	mov	%rsp,%rax
	mov	%rdx,$key
	lea	-63(%rdx),%rcx
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp

	push	%rax			# save real stack pointer
	push	%rsi			# save out

	mov	240($key),$rnds		# load rounds

	mov	0(%rdi),$s0		# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	push	%rbp
	push	$key

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	.picmeup	$sbox
	lea	AES_Te+2048-.($sbox),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox

	call	_x86_64_AES_encrypt_compact

	mov	16(%rsp),$out		# restore out
	mov	24(%rsp),%rsp
	mov	$s0,0($out)		# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	AES_encrypt,.-AES_encrypt
___

#------------------------------------------------------------------#

# decvert - append one table-driven decryption round to $code.
#
# Takes no arguments.  The emitted code looks up each byte of the four state
# registers in the 8-byte-stride table at $sbox (entry offsets 0, 3, 2 and 1
# for bytes 0, 1, 2 and 3), accumulates the results in $t0..$t2 and %r8d,
# advances $key by 16, and leaves the new state (round key xor accumulated
# words) in $s0..$s3.  %r8d is used as the fourth accumulator, which is why
# $inp is clobbered.
sub decvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s1")`,$acc0
	shr	\$16,$s0
	movzb	`&hi("$s2")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	mov	12($key),$s3
	movzb	`&hi("$s0")`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
}

# declastvert - append the final decryption round to $code.
#
# Takes no arguments.  The emitted code temporarily advances $sbox by 2048 to
# address a byte-wide table, substitutes every state byte through it, shifts
# each result back into its byte lane (8/16/24), xors the assembled words with
# the round key at 16($key) and restores $sbox.  The trailing "#$tN" notes on
# the lookups name the accumulator each byte is destined for.  %r8d is used as
# the fourth accumulator, which is why $inp is clobbered.
sub declastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	lea	2048($sbox),$sbox	# size optimization
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$t0
	movzb	($sbox,$acc1,1),$t1
	movzb	($sbox,$acc2,1),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$8,$acc1
	shl	\$8,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s0
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3

	shl	\$8,$acc0
	shl	\$8,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$acc2,1),$acc2	#$t2

	shl	\$16,$acc0
	shl	\$16,$acc1
	shl	\$16,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$16,$acc0
	shl	\$24,$acc1
	shl	\$24,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	mov	16+12($key),$s3
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3
	mov	16+0($key),$s0

	shl	\$24,$acc0
	shl	\$24,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	lea	-2048($sbox),$sbox
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# decstep - append the code computing one output word of a decryption round.
#
# Arguments:
#   $i - index (0..3) of the output word; selects the destination register
#        from ($t0,$t1,$t2,$s[0]).
#   @s - four state registers; byte 0 is taken from $s[0], byte 1 from $s[1],
#        byte 2 from $s[2] and byte 3 from $s[3].
#
# For $i in 0..2 the inputs are first copied to scratch registers
# ($acc0..$acc2) so the state survives.  For $i==3 the state registers are
# themselves used as scratch (they are no longer needed), and afterwards
# $t2,$t1,$t0 are copied into $s[1],$s[2],$s[3].
#
# The sub carries an empty prototype, so callers must use the &decstep(...)
# form, which bypasses prototype checking.
sub decstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	# byte 0: low byte of $s[0] indexes table offset 0
	$code.=" mov $s[0],$out\n" if ($i!=3);
	$tmp1=$s[2] if ($i==3);
	$code.=" mov $s[2],$tmp1\n" if ($i!=3);
	$code.=" and \$0xFF,$out\n";

	$code.=" mov 0($sbox,$out,8),$out\n";
	$code.=" shr \$16,$tmp1\n";
	$tmp2=$s[3] if ($i==3);
	$code.=" mov $s[3],$tmp2\n" if ($i!=3);

	# bytes 1..3 index table offsets 3, 2 and 1 respectively
	$tmp0=$s[1] if ($i==3);
	$code.=" movzb ".&hi($s[1]).",$tmp0\n";
	$code.=" and \$0xFF,$tmp1\n";
	$code.=" shr \$24,$tmp2\n";

	$code.=" xor 3($sbox,$tmp0,8),$out\n";
	$code.=" xor 2($sbox,$tmp1,8),$out\n";
	$code.=" xor 1($sbox,$tmp2,8),$out\n";

	# last word done: move the three parked results back into the state
	$code.=" mov $t2,$s[1]\n" if ($i==3);
	$code.=" mov $t1,$s[2]\n" if ($i==3);
	$code.=" mov $t0,$s[3]\n" if ($i==3);
	$code.="\n";
}

# declast - append the code computing one output word of the final
# decryption round.
#
# Arguments and register usage mirror decstep ($i selects the destination
# from ($t0,$t1,$t2,$s[0]); @s supplies bytes 0..3 in order).  Instead of
# 8-byte table entries, each byte is substituted through the byte-wide table
# at 2048($sbox) and shifted into its lane (8, 16, 24) before being xored
# into the destination.
#
# The sub carries an empty prototype, so callers must use the &declast(...)
# form, which bypasses prototype checking.
sub declast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	$code.=" mov $s[0],$out\n" if ($i!=3);
	$tmp1=$s[2] if ($i==3);
	$code.=" mov $s[2],$tmp1\n" if ($i!=3);
	$code.=" and \$0xFF,$out\n";

	$code.=" movzb 2048($sbox,$out,1),$out\n";
	$code.=" shr \$16,$tmp1\n";
	$tmp2=$s[3] if ($i==3);
	$code.=" mov $s[3],$tmp2\n" if ($i!=3);

	$tmp0=$s[1] if ($i==3);
	$code.=" movzb ".&hi($s[1]).",$tmp0\n";
	$code.=" and \$0xFF,$tmp1\n";
	$code.=" shr \$24,$tmp2\n";

	$code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n";
	$code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n";
	$code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n";

	$code.=" shl \$8,$tmp0\n";
	$code.=" shl \$16,$tmp1\n";
	$code.=" shl \$24,$tmp2\n";

	# for $i==3 the parked results are moved back between the xors
	$code.=" xor $tmp0,$out\n";
	$code.=" mov $t2,$s[1]\n" if ($i==3);
	$code.=" xor $tmp1,$out\n";
	$code.=" mov $t1,$s[2]\n" if ($i==3);
	$code.=" xor $tmp2,$out\n";
	$code.=" mov $t0,$s[3]\n" if ($i==3);
	$code.="\n";
}

# Emit _x86_64_AES_decrypt, the internal table-driven decryption routine.
#
# The emitted code xors the state with the first round key, loads the round
# count from 240($key), and runs rounds-1 iterations of .Ldec_loop followed by
# one final round.  $verticalspin selects, at generation time, between the
# "vertical" emitters (decvert/declastvert, which also apply the round key)
# and the per-word emitters (decstep/declast, after which the round key is
# applied by explicit xor instructions).
$code.=<<___;
.type	_x86_64_AES_decrypt,\@abi-omnipotent
.align	16
_x86_64_AES_decrypt:
	xor	0($key),$s0		# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds		# load key->rounds
	sub	\$1,$rnds
	jmp	.Ldec_loop
.align	16
.Ldec_loop:
___
	if ($verticalspin) {
		&decvert();
	}
	else {
		&decstep(0,$s0,$s3,$s2,$s1);
		&decstep(1,$s1,$s0,$s3,$s2);
		&decstep(2,$s2,$s1,$s0,$s3);
		&decstep(3,$s3,$s2,$s1,$s0);
		$code.=<<___;
	lea	16($key),$key
	xor	0($key),$s0		# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
___
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Ldec_loop
___
	if ($verticalspin) {
		&declastvert();
	}
	else {
		&declast(0,$s0,$s3,$s2,$s1);
		&declast(1,$s1,$s0,$s3,$s2);
		&declast(2,$s2,$s1,$s0,$s3);
		&declast(3,$s3,$s2,$s1,$s0);
		$code.=<<___;
	xor	16+0($key),$s0		# xor with key
	xor	16+4($key),$s1
	xor	16+8($key),$s2
	xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___

# deccompactvert - append compact (byte-table) decryption code to $code.
#
# Takes no arguments.  Uses %r8d, %r9d and %r13d as extra temporaries in
# addition to $t0..$t2 and $acc0..$acc2; towards the end $s1..$s3 are reused
# as scratch for looked-up bytes.  The trailing "#$tN" notes on the lookups
# name the accumulator each byte is destined for.
sub deccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s1")`,$acc2
	movzb	`&hi("$s2")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s1")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	`&hi("$s2")`,$acc1
	shl	\$16,$t4
	shl	\$16,$t5
	movzb	($sbox,$acc1,1),$s1	#$t1
	xor	$t4,$t1
	xor	$t5,$t2

	movzb	`&hi("$s3")`,$acc1
	shr	\$8,$s0
	shl	\$16,$acc2
	movzb	($sbox,$acc1,1),$s2	#$t2
	movzb	($sbox,$s0,1),$s3	#$t3
	xor	$acc2,$t3

	shl	\$24,$acc0
	shl	\$24,$s1
	shl	\$24,$s2
	xor	$acc0,$t0
	shl	\$24,$s3
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -