📄 aes-x86_64.pl

📁 开放的SSL工具
💻 PL
📖 第 1 页 / 共 5 页
字号:
1 2 3 4 5 下一页
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Version 2.1.
#
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode,
# as well as avoiding L1 cache aliasing between stack frame and key
# schedule and already mentioned tables, compressed Td4...
#
# Performance in number of cycles per processed byte for 128-bit key:
#
#		ECB encrypt	ECB decrypt	CBC large chunk
# AMD64		33		41		13.0
# EM64T		38		59		18.6(*)
# Core 2	30		43		14.5(*)
#
# (*) with hyper-threading off

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit=512;	# see aes-586.pl for details

# First command-line argument; it is passed through on the translator's
# command line below.
$output=shift;

# Locate x86_64-xlate.pl: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything written to STDOUT from here on is piped into the translator.
# Fail loudly if the pipe cannot be started instead of silently losing
# the generated code.
open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!";

# $code accumulates the generated text; the variables below name the
# registers that are interpolated into it.
$code=".text\n";

$s0="%eax";
$s1="%ebx";
$s2="%ecx";
$s3="%edx";
$acc0="%esi";	$mask80="%rsi";
$acc1="%edi";	$maskfe="%rdi";
$acc2="%ebp";	$mask1b="%rbp";
$inp="%r8";
$out="%r9";
$t0="%r10d";
$t1="%r11d";
$t2="%r12d";
$rnds="%r13d";
$sbox="%r14";
$key="%r15";

# Register-name helpers.  They are declared with an empty prototype but
# take one argument; the visible callers use the &name(...) form, which
# bypasses the prototype.
#
# hi(): rewrite %eax/%rax-style names (a..d) to the high-byte name (%ah).
sub hi() { my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}h/;	$r; }
# lo(): rewrite a register name to its low-byte name:
#	%eax/%rax -> %al, %esi/%rdi -> %sil/%dil, %r10/%r10d -> %r10b.
# Names matching none of the patterns are returned unchanged.
sub lo() { my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}l/;
			$r =~ s/%[er]([sd]i)/%${1}l/;
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }
# LO(): rewrite a 64-bit spelling to the 32-bit one:
#	%rsi -> %esi, %r8 -> %r8d.
sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e${1}/;
			$r =~ s/%r([0-9]+)/%r${1}d/;	$r; }

# _data_word(): for every argument append one ".long" line to $code that
# carries the value twice.
sub _data_word()
{ my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
}

# data_word(): append a single ".long" line to $code listing all
# arguments, comma-separated, as 32-bit hex values.
sub data_word()
{ my $i;
  my $last=pop(@_);	# kept apart so that the line ends without a comma
    $code.=".long\t";
    while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
    $code.=sprintf"0x%08x\n",$last;
}

# data_byte(): append a single ".byte" line to $code listing all
# arguments, each masked to its low 8 bits.
sub data_byte()
{ my $i;
  my $last=pop(@_);	# kept apart so that the line ends without a comma
    $code.=".byte\t";
    while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
    $code.=sprintf"0x%02x\n",$last&0xff;
}

# encvert(): append the "vertical" flavour of one encryption round to
# $code.  The `...` spans are emitted literally at this point; presumably
# a later pass outside this excerpt evaluates them -- TODO confirm.
sub encvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3
	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3
	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0
	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2
	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3
	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enclastvert(): append the "vertical" flavour of the final encryption
# round to $code; it also emits the loads from 16+N($key) that are
# xor-ed into $s0..$s3 at the end.
sub enclastvert()
{ my $t3="%r8d";	# zaps $inp!

$code.=<<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t0
	movzb	2($sbox,$acc1,8),$t1
	movzb	2($sbox,$acc2,8),$t2
	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1
	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2
	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2
	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3
	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0
	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2
	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2
	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2
	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1
	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2
	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0
	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1
	xor	$acc0,$t2
	xor	$acc1,$t3
	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# encstep($i,@s): append the code for output word $i (0..3) of one round
# in the non-vertical flavour.  @s is the state register list rotated so
# that $s[0] is the word supplying the low byte.  For $i==3 the result is
# built in $s[0] itself, $s[1..3] double as scratch registers, and
# $t0..$t2 are copied back into $s[1..3] at the end.
sub encstep()
{ my ($i,@s) = @_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);
	$code.="	lea	16($key),$key\n"	if ($i==0);

	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	mov	0($sbox,$out,8),$out\n";

	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);
	$code.="	xor	3($sbox,$tmp0,8),$out\n";

	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";
	$code.="	xor	4*$i($key),$out\n";

	$code.="	xor	2($sbox,$tmp1,8),$out\n";
	$code.="	xor	1($sbox,$tmp2,8),$out\n";

	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

# enclast($i,@s): counterpart of encstep() for the final round.  Each of
# the four table loads is masked down to a single byte lane before being
# xor-ed into $out; no round-key xor is emitted here (the caller appends
# it).
sub enclast()
{ my ($i,@s)=@_;
  my $tmp0=$acc0;
  my $tmp1=$acc1;
  my $tmp2=$acc2;
  my $out=($t0,$t1,$t2,$s[0])[$i];

	if ($i==3) {
		$tmp0=$s[1];
		$tmp1=$s[2];
		$tmp2=$s[3];
	}
	$code.="	movzb	".&lo($s[0]).",$out\n";
	$code.="	mov	$s[2],$tmp1\n"		if ($i!=3);

	$code.="	mov	2($sbox,$out,8),$out\n";
	$code.="	shr	\$16,$tmp1\n";
	$code.="	mov	$s[3],$tmp2\n"		if ($i!=3);

	$code.="	and	\$0x000000ff,$out\n";
	$code.="	movzb	".&hi($s[1]).",$tmp0\n";
	$code.="	movzb	".&lo($tmp1).",$tmp1\n";
	$code.="	shr	\$24,$tmp2\n";

	$code.="	mov	0($sbox,$tmp0,8),$tmp0\n";
	$code.="	mov	0($sbox,$tmp1,8),$tmp1\n";
	$code.="	mov	2($sbox,$tmp2,8),$tmp2\n";

	$code.="	and	\$0x0000ff00,$tmp0\n";
	$code.="	and	\$0x00ff0000,$tmp1\n";
	$code.="	and	\$0xff000000,$tmp2\n";

	$code.="	xor	$tmp0,$out\n";
	$code.="	mov	$t0,$s[1]\n"		if ($i==3);
	$code.="	xor	$tmp1,$out\n";
	$code.="	mov	$t1,$s[2]\n"		if ($i==3);
	$code.="	xor	$tmp2,$out\n";
	$code.="	mov	$t2,$s[3]\n"		if ($i==3);
	$code.="\n";
}

# Emit _x86_64_AES_encrypt: initial key xor, the round loop (vertical or
# step-wise flavour depending on $verticalspin), the final round and the
# return.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
	if ($verticalspin) { &encvert(); }
	else {	&encstep(0,$s0,$s1,$s2,$s3);
		&encstep(1,$s1,$s2,$s3,$s0);
		&encstep(2,$s2,$s3,$s0,$s1);
		&encstep(3,$s3,$s0,$s1,$s2);
	}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
	if ($verticalspin) { &enclastvert(); }
	else {	&enclast(0,$s0,$s1,$s2,$s3);
		&enclast(1,$s1,$s2,$s3,$s0);
		&enclast(2,$s2,$s3,$s0,$s1);
		&enclast(3,$s3,$s0,$s1,$s2);
		$code.=<<___;
		xor	16+0($key),$s0		# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
	}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___

# it's possible to implement this by shifting tN by 8, filling least
# significant byte with byte load and finally bswap-ing at the end,
# but such partial register load kills Core 2...
#
# enccompactvert(): append a round body that indexes $sbox with scale 1
# (byte-wide entries) and assembles each output word with shifts and
# xors.  Uses %r8d, %r9d and %r13d as extra temporaries.
sub enccompactvert()
{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");

$code.=<<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2
	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1
	movzb	`&hi("$s3")`,$acc2
	movzb	`&hi("$s0")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3
	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1
	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3
	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s3")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0
	movzb	`&hi("$s0")`,$acc1
	shr	\$8,$s2
	shr	\$8,$s1
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2
	shl	\$16,$t4
	shl	\$16,$t5
	shl	\$16,$acc2
	xor	$t4,$t1
	xor	$t5,$t2
	xor	$acc2,$t3
	shl	\$24,$acc0
	shl	\$24,$acc1
	shl	\$24,$s3
	xor	$acc0,$t0
	shl	\$24,$s2
	xor	$acc1,$t1
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}

# enctransform_ref($sn): append the transform of the single register
# named by $sn, one word at a time.  Uses %r8d, %r9d and %r13d as scratch.
sub enctransform_ref()
{ my $sn = shift;
  my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");

$code.=<<___;
	mov	$sn,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tmp
	shr	\$7,$tmp
	lea	($sn,$sn),$r2
	sub	$tmp,$acc
	and	\$0xfefefefe,$r2
	and	\$0x1b1b1b1b,$acc
	mov	$sn,$tmp
	xor	$acc,$r2
	xor	$r2,$sn
	rol	\$24,$sn
	xor	$r2,$sn
	ror	\$16,$tmp
	xor	$tmp,$sn
	ror	\$8,$tmp
	xor	$tmp,$sn
___
}

# unlike decrypt case it does not pay off to parallelize enctransform
#
# NOTE(review): this excerpt ends partway through the heredoc below; the
# remaining instructions, the "___" terminator and the closing brace of
# enctransform() are not part of it and must be restored from the
# complete file.
sub enctransform()
{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");

$code.=<<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$s2,$acc0
	 mov	$s3,$acc1
	rol	\$24,$s0
	rol	\$24,$s1
	 and	\$0x80808080,$acc0
	 and	\$0x80808080,$acc1
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$acc0,$t2
1 2 3 4 5 下一页
💿 文件大小 3681 K
👤 上传用户 xof1234
📂 所属分类 Internet/网络编程
🏷️ 相关标签

#SSL
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -