aes-x86_64.pl
	xor	$t1,$s1
	mov	$t0,$s0
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# parallelized version! input is pair of 64-bit values: %rax=s1.s0
# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
# %ecx=s2 and %edx=s3.
sub dectransform()
{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
  my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
  my $prefetch = shift;
$code.=<<___;
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48
	shr	\$7,$tp40
	lea	($tp10,$tp10),$tp20
	shr	\$7,$tp48
	lea	($tp18,$tp18),$tp28
	sub	$tp40,$acc0
	sub	$tp48,$acc8
	and	$maskfe,$tp20
	and	$maskfe,$tp28
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp20,$acc0
	xor	$tp28,$acc8
	mov	$acc0,$tp20
	mov	$acc8,$tp28

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	shr	\$7,$tp80
	lea	($tp20,$tp20),$tp40
	shr	\$7,$tp88
	lea	($tp28,$tp28),$tp48
	sub	$tp80,$acc0
	sub	$tp88,$acc8
	and	$maskfe,$tp40
	and	$maskfe,$tp48
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp40,$acc0
	xor	$tp48,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	shr	\$7,$tp80
	xor	$tp10,$tp20		# tp2^=tp1
	shr	\$7,$tp88
	xor	$tp18,$tp28		# tp2^=tp1
	sub	$tp80,$acc0
	sub	$tp88,$acc8
	lea	($tp40,$tp40),$tp80
	lea	($tp48,$tp48),$tp88
	xor	$tp10,$tp40		# tp4^=tp1
	xor	$tp18,$tp48		# tp4^=tp1
	and	$maskfe,$tp80
	and	$maskfe,$tp88
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$acc0,$tp80
	xor	$acc8,$tp88

	xor	$tp80,$tp10		# tp1^=tp8
	xor	$tp88,$tp18		# tp1^=tp8
	xor	$tp80,$tp20		# tp2^tp1^=tp8
	xor	$tp88,$tp28		# tp2^tp1^=tp8
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	xor	$tp80,$tp40		# tp4^tp1^=tp8
	xor	$tp88,$tp48		# tp4^tp1^=tp8
	shr	\$32,$acc0
	shr	\$32,$acc8
	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2

	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
	xor	`&LO("$tp80")`,`&LO("$tp10")`
	xor	`&LO("$tp88")`,`&LO("$tp18")`
	shr	\$32,$tp80
	shr	\$32,$tp88
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	mov	$tp20,$tp80
	mov	$tp28,$tp88
	shr	\$32,$tp80
	shr	\$32,$tp88
	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
	xor	`&LO("$tp20")`,`&LO("$tp10")`
	xor	`&LO("$tp28")`,`&LO("$tp18")`
	mov	$tp40,$tp20
	mov	$tp48,$tp28
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	`"mov	0($sbox),$mask80"	if ($prefetch)`
	shr	\$32,$tp20
	shr	\$32,$tp28
	`"mov	64($sbox),$maskfe"	if ($prefetch)`
	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	128($sbox),$mask1b"	if ($prefetch)`
	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	192($sbox),$tp80"	if ($prefetch)`
	xor	`&LO("$tp40")`,`&LO("$tp10")`
	xor	`&LO("$tp48")`,`&LO("$tp18")`
	`"mov	256($sbox),$tp88"	if ($prefetch)`
	xor	`&LO("$tp20")`,`&LO("$acc0")`
	xor	`&LO("$tp28")`,`&LO("$acc8")`
___
}
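# Reference sketch (an addition for exposition, not part of the original
# module and not used by the generated code): each mask80/maskfe/mask1b
# pass in dectransform above is a branchless GF(2^8) doubling ("xtime")
# applied to all eight bytes packed in a 64-bit register at once. Assuming
# a 64-bit perl, one such pass computes:
sub xtime_packed64_ref {
    my $v = shift;                              # eight packed bytes
    my $hi = $v & 0x8080808080808080;           # top bit of every byte
    my $fold = ($hi - ($hi >> 7)) & 0x1b1b1b1b1b1b1b1b;  # 0x1b where it was set
    return (($v & 0x7f7f7f7f7f7f7f7f) << 1) ^ $fold;     # shift within bytes, reduce
}
# Three chained doublings yield tp2, tp4 and tp8; the ROTATE(...) steps in
# dectransform then combine them into the InvMixColumns multiples 0x09,
# 0x0b, 0x0d and 0x0e of the original bytes.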
$code.=<<___;
.type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_decrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Td4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Ldec_loop_compact
.align	16
.Ldec_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___
	&deccompactvert();
$code.=<<___;
	cmp	16(%rsp),$key
	je	.Ldec_compact_done

	mov	256+0($sbox),$mask80
	shl	\$32,%rbx
	shl	\$32,%rdx
	mov	256+8($sbox),$maskfe
	or	%rbx,%rax
	or	%rdx,%rcx
	mov	256+16($sbox),$mask1b
___
	&dectransform(1);
$code.=<<___;
	jmp	.Ldec_loop_compact
.align	16
.Ldec_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
___

# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
$code.=<<___;
.globl	AES_decrypt
.type	AES_decrypt,\@function,3
.align	16
AES_decrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# allocate frame "above" key schedule
	mov	%rsp,%rax
	mov	%rdx,$key
	lea	-63(%rdx),%rcx
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp

	push	%rax			# save real stack pointer
	push	%rsi			# save out

	mov	240($key),$rnds		# load rounds
	mov	0(%rdi),$s0		# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3
	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	push	%rbp
	push	$key

	# pick Td4 copy which can't "overlap" with stack frame or key schedule
	.picmeup	$sbox
	lea	AES_Td+2048-.($sbox),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox
	shr	\$3,%rbp		# recall "magic" constants!
	add	%rbp,$sbox

	call	_x86_64_AES_decrypt_compact

	mov	16(%rsp),$out		# restore out
	mov	24(%rsp),%rsp
	mov	$s0,0($out)		# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	AES_decrypt,.-AES_decrypt
___

#------------------------------------------------------------------#

sub enckey()
{
$code.=<<___;
	movz	%dl,%esi			# rk[i]>>0
	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi			# rk[i]>>8
	shl	\$24,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shr	\$16,%edx
	movz	%dl,%esi			# rk[i]>>16
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi			# rk[i]>>24
	shl	\$8,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shl	\$16,%ebx
	xor	%ebx,%eax

	xor	1024-128(%rbp,%rcx,4),%eax	# rcon
___
}
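# Reference sketch (an addition for exposition, not part of the original
# module): enckey above emits one SubWord(RotWord(.)) ^ rcon step of the
# FIPS-197 key schedule, with the round constant loaded from the table at
# 1024-128(%rbp,%rcx,4). Those constants are successive GF(2^8) doublings
# of 1:
sub rcon_ref {
    my @rcon = (0x01);
    for (1 .. 9) {
        my $r = $rcon[-1] << 1;          # double in GF(2^8)
        $r ^= 0x11b if ($r & 0x100);     # reduce by the AES polynomial
        push @rcon, $r;
    }
    return @rcon;                        # 0x01,0x02,...,0x80,0x1b,0x36
}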
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
#                         AES_KEY *key)
$code.=<<___;
.globl	AES_set_encrypt_key
.type	AES_set_encrypt_key,\@function,3
.align	16
AES_set_encrypt_key:
	call	_x86_64_AES_set_encrypt_key
	ret
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

.type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align	16
_x86_64_AES_set_encrypt_key:
	push	%rbx
	push	%rbp

	mov	%esi,%ecx			# %ecx=bits
	mov	%rdi,%rsi			# %rsi=userKey
	mov	%rdx,%rdi			# %rdi=key

	test	\$-1,%rsi
	jz	.Lbadpointer
	test	\$-1,%rdi
	jz	.Lbadpointer

	.picmeup %rbp
	lea	AES_Te-.(%rbp),%rbp
	lea	2048+128(%rbp),%rbp		# prefetch Te4

	mov	0-128(%rbp),%eax
	mov	32-128(%rbp),%ebx
	mov	64-128(%rbp),%r8d
	mov	96-128(%rbp),%edx
	mov	128-128(%rbp),%eax
	mov	160-128(%rbp),%ebx
	mov	192-128(%rbp),%r8d
	mov	224-128(%rbp),%edx

	cmp	\$128,%ecx
	je	.L10rounds
	cmp	\$192,%ecx
	je	.L12rounds
	cmp	\$256,%ecx
	je	.L14rounds
	mov	\$-2,%rax			# invalid number of bits
	jmp	.Lexit

.L10rounds:
	mov	0(%rsi),%rax			# copy first 4 dwords
	mov	8(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rdx,8(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L10shortcut
.align	4
.L10loop:
	mov	0(%rdi),%eax			# rk[0]
	mov	12(%rdi),%edx			# rk[3]
.L10shortcut:
___
	&enckey	();
$code.=<<___;
	mov	%eax,16(%rdi)			# rk[4]
	xor	4(%rdi),%eax
	mov	%eax,20(%rdi)			# rk[5]
	xor	8(%rdi),%eax
	mov	%eax,24(%rdi)			# rk[6]
	xor	12(%rdi),%eax
	mov	%eax,28(%rdi)			# rk[7]
	add	\$1,%ecx
	lea	16(%rdi),%rdi
	cmp	\$10,%ecx
	jl	.L10loop

	movl	\$10,80(%rdi)			# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.L12rounds:
	mov	0(%rsi),%rax			# copy first 6 dwords
	mov	8(%rsi),%rbx
	mov	16(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rbx,8(%rdi)
	mov	%rdx,16(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L12shortcut
.align	4
.L12loop:
	mov	0(%rdi),%eax			# rk[0]
	mov	20(%rdi),%edx			# rk[5]
.L12shortcut:
___
	&enckey	();
$code.=<<___;
	mov	%eax,24(%rdi)			# rk[6]
	xor	4(%rdi),%eax
	mov	%eax,28(%rdi)			# rk[7]
	xor	8(%rdi),%eax
	mov	%eax,32(%rdi)			# rk[8]
	xor	12(%rdi),%eax
	mov	%eax,36(%rdi)			# rk[9]

	cmp	\$7,%ecx
	je	.L12break
	add	\$1,%ecx

	xor	16(%rdi),%eax
	mov	%eax,40(%rdi)			# rk[10]
	xor	20(%rdi),%eax
	mov	%eax,44(%rdi)			# rk[11]

	lea	24(%rdi),%rdi
	jmp	.L12loop
.L12break:
	movl	\$12,72(%rdi)			# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.L14rounds:
	mov	0(%rsi),%rax			# copy first 8 dwords
	mov	8(%rsi),%rbx
	mov	16(%rsi),%rcx
	mov	24(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rbx,8(%rdi)
	mov	%rcx,16(%rdi)
	mov	%rdx,24(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L14shortcut
.align	4
.L14loop:
	mov	0(%rdi),%eax			# rk[0]
	mov	28(%rdi),%edx			# rk[4]
.L14shortcut:
___
	&enckey	();
$code.=<<___;
	mov	%eax,32(%rdi)			# rk[8]
	xor	4(%rdi),%eax
	mov	%eax,36(%rdi)			# rk[9]
	xor	8(%rdi),%eax
	mov	%eax,40(%rdi)			# rk[10]
	xor	12(%rdi),%eax
	mov	%eax,44(%rdi)			# rk[11]

	cmp	\$6,%ecx
	je	.L14break
	add	\$1,%ecx

	mov	%eax,%edx
	mov	16(%rdi),%eax			# rk[4]
	movz	%dl,%esi			# rk[11]>>0
	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi			# rk[11]>>8
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shr	\$16,%edx
	shl	\$8,%ebx
	movz	%dl,%esi			# rk[11]>>16
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi			# rk[11]>>24
	shl	\$16,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shl	\$24,%ebx
	xor	%ebx,%eax

	mov	%eax,48(%rdi)			# rk[12]
	xor	20(%rdi),%eax
	mov	%eax,52(%rdi)			# rk[13]
	xor	24(%rdi),%eax
	mov	%eax,56(%rdi)			# rk[14]
	xor	28(%rdi),%eax
	mov	%eax,60(%rdi)			# rk[15]

	lea	32(%rdi),%rdi
	jmp	.L14loop
.L14break:
	movl	\$14,48(%rdi)			# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.Lbadpointer:
	mov	\$-1,%rax
.Lexit:
	pop	%rbp
	pop	%rbx
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
___

sub deckey_ref()
{ my ($i,$ptr,$te,$td) = @_;
  my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
$code.=<<___;
	mov	$i($ptr),$tp1
	mov	$tp1,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tp4
	shr	\$7,$tp4
	lea	0($tp1,$tp1),$tp2
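# Reference sketch (an addition for exposition, not part of the original
# module): deckey_ref and dectransform both evaluate InvMixColumns on a
# word of four packed bytes. Following the ROTATE(...) comments in
# dectransform, and assuming a 64-bit perl, the per-word computation is:
sub invmix_word_ref {
    my $tp1 = shift;                     # four packed bytes
    my $xtime32 = sub {                  # GF(2^8) doubling, four bytes at once
        my $v  = shift;
        my $hi = $v & 0x80808080;
        return (($v & 0x7f7f7f7f) << 1) ^ (($hi - ($hi >> 7)) & 0x1b1b1b1b);
    };
    my $rotl32 = sub {                   # 32-bit rotate left
        my ($v, $n) = @_;
        return (($v << $n) | ($v >> (32 - $n))) & 0xffffffff;
    };
    my $tp2 = $xtime32->($tp1);
    my $tp4 = $xtime32->($tp2);
    my $tp8 = $xtime32->($tp4);
    my $tp9 = $tp1 ^ $tp8;               # 0x09 times each byte
    my $tpb = $tp2 ^ $tp9;               # 0x0b times each byte
    my $tpd = $tp4 ^ $tp9;               # 0x0d times each byte
    my $tpe = $tp2 ^ $tp4 ^ $tp8;        # 0x0e times each byte
    return $tpe ^ $rotl32->($tp9, 8) ^ $rotl32->($tpb, 24) ^ $rotl32->($tpd, 16);
}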