⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cmll-x86_64.pl

📁 著名的开源密码源代码
💻 PL
📖 第 1 页 / 共 2 页
字号:
#!/usr/bin/env perl
# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================
# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD64	Core2	EM64T
# -evp camellia-128-ecb	16.7	21.0	22.7
# + over gcc 3.4.6	+25%	+5%	0%
#
# camellia-128-cbc	15.7	20.4	21.1
#
# 128-bit key setup	128	216	205	cycles/key
# + over gcc 3.4.6	+54%	+39%	+15%
#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed.
# Improvement on
# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some of 64-bit operations in [32-bit] microcode.

# Command line: [flavour] output.  A single argument containing a dot
# is taken to be the output file name, with no flavour given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by a nasm/masm/mingw64 flavour or a .asm output
# file; it only affects which register carries the first argument.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator, first next to this script and
# then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  A failed pipe open used to go unnoticed; report it.
open STDOUT,"| $^X $xlate $flavour $output"
    or die "can't call $xlate: $!";

# hi(REG): rewrite %eax/%rax .. %edx/%rdx to the high-byte name %ah .. %dh.
# Other register names are returned unchanged.
# No prototype: the sub takes one argument (callers use &hi(...)).
sub hi {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;
    return $r;
}

# lo(REG): rewrite a register name to its low-byte form:
#   %eax/%rax .. %edx/%rdx  -> %al .. %dl
#   %esi/%rsi, %edi/%rdi    -> %sil, %dil
#   %rN / %rNd              -> %rNb
sub lo {
    my $r=shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    return $r;
}

# Register allocation used by the code generators below.
$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
@S=("%r8d","%r9d","%r10d","%r11d");
$i0="%esi";
$i1="%edi";
$Tbl="%rbp";	# size optimization
$inp="%r12";
$out="%r13";
$key="%r14";
$keyend="%r15";
$arg0d=$win64?"%ecx":"%edi";

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. 
# This is done to minimize code size.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]

# Camellia_Feistel(i [, seed])
# Append the code for one Feistel round to $code.
#   i    - round index; its parity picks the register roles: even i
#          reads @S[0],@S[1] and updates @S[2],@S[3], odd i the reverse.
#   seed - base displacement of the round keys relative to $key
#          (default 0).  A negative seed makes the key displacement
#          step by -8 per round instead of +8.
# On entry $t0/$t1 must already hold the key words for this round; the
# emitted code reloads them with the words for round i+1.
sub Camellia_Feistel {
my ($i,$seed)=@_;
$seed=0 unless defined($seed);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# All four names are lexical; the old comma-chained "my" only declared $s0.
my ($s0,$s1,$s2,$s3)=@S[($j)%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
$code=<<___;
.text
# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lenc_prologue:
	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key
	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend
	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
	call	_x86_64_Camellia_encrypt
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Lenc_epilogue:
	ret
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0
___
	# Six Feistel rounds per loop iteration, keys at 16($key) upwards.
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone
	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop
.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3
	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]
	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Ldec_prologue:
	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend
	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key
	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
	call	_x86_64_Camellia_decrypt
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Ldec_epilogue:
	ret
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0
___
	# Same six rounds, but with a negative seed so the key
	# displacements step downwards from -8($key).
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone
	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Ldloop
.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1
	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]
	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___

# _saveround(rnd, key, [bias,] regs...)
# Append stores of the given registers to the key table addressed by
# register $key, at displacement bias+rnd*8.  The optional bias is
# recognised by being a non-zero number (register names evaluate to 0
# under int()).  With four registers they are stored as 32-bit words
# in the order regs[1],regs[0],regs[3],regs[2] at +0,+4,+8,+12;
# otherwise the first register goes to +0 and the second, if present,
# to +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov	@T[1],`$bias+$rnd*8+0`($key)
	mov	@T[0],`$bias+$rnd*8+4`($key)
	mov	@T[3],`$bias+$rnd*8+8`($key)
	mov	@T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="	mov	@T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="	mov	@T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}

# _loadround(rnd, key, [bias,] regs...)
# Counterpart of _saveround for one or two registers: append loads
# from displacement bias+rnd*8 (+0 and, for a second register, +8).
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

$code.="	mov	`$bias+$rnd*8+0`($key),@T[0]\n";
$code.="	mov	`$bias+$rnd*8+8`($key),@T[1]\n"	if ($#T>=1);
}

# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shld	\$$rot,$i1,$i0
	shld	\$$rot,%r11,$i1
___
    }
}

# ... Implementing 128-bit rotate without shld gives 80% better
# performance EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
# The emitted code uses %r9 and %r11 as scratch registers; nothing is
# emitted when $rot is 0.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shl	\$$rot,$i0
	mov	$i1,%r9
	shr	\$`64-$rot`,%r9
	shr	\$`64-$rot`,%r11
	or	%r9,$i0
	shl	\$$rot,$i1
	or	%r11,$i1
___
    }
}

# Key schedule generator.  $step counts the Feistel rounds emitted so
# far and is passed to Camellia_Feistel as the round index.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lkey_prologue:
	mov	%rdi,$keyend		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable
	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128
	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	&_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl
	mov	0($key),$t1
	mov	4($key),$t0
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	&_loadround	(0,$out,-128,"%rax","%rbx");	# KL
	&_saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
	&_rotl128	("%rax","%rbx",15);
	&_saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
	&_rotl128	("%r8","%r10",15);
	&_saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
	&_rotl128	("%r8","%r10",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
	&_rotl128	("%rax","%rbx",30);		# 15+30=45
	&_saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
	&_rotl128	("%r8","%r10",15);		# 30+15=45
	&_saveround	(12,$out,-128,"%r8");		# KA<<<45
	&_rotl128	("%rax","%rbx",15);		# 45+15=60
	&_saveround	(13,$out,-128,"%rbx");		# KL<<<60
	&_rotl128	("%r8","%r10",15);		# 45+15=60
	&_saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
	&_rotl128	("%rax","%rbx",17);		# 60+17=77
	&_saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
	&_rotl128	("%rax","%rbx",17);		# 77+17=94
	&_saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
	&_rotl128	("%r8","%r10",34);		# 60+34=94
	&_saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
	&_rotl128	("%rax","%rbx",17);		# 94+17=111
	&_saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
	&_rotl128	("%r8","%r10",17);		# 94+17=111
	&_saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	&_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	&Camellia_Feistel($step++);
	&Camellia_Feistel($step++);

	&_loadround	(0,$out,"%rax","%rbx");	# KL
	&_loadround	(4,$out,"%rcx","%rdx");	# KR
	&_loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
	&_saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
	&_rotl128	("%rcx","%rdx",15);
	&_saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
	&_rotl128	("%r14","%r15",15);
	&_saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
	&_rotl128	("%rcx","%rdx",15);		# 15+15=30
	&_saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
	&_rotl128	("%r8","%r10",30);
	&_saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -