⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cmll-x86.pl

📁 OpenSSL 0.9.8k 最新版OpenSSL
💻 PL
📖 第 1 页 / 共 3 页
字号:
#!/usr/bin/env perl

# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD K8	Core2	PIII	P4
# -evp camellia-128-ecb	21.5	22.8	27.0	28.9
# + over gcc 3.4.6	+90/11% +70/10%	+53/4%	+160/64%
# + over icc 8.0	+48/19% +21/15%	+21/17%	+55/37%
#
# camellia-128-cbc	17.3	21.1	23.9	25.9
#
# 128-bit key setup	196	280	256	240	cycles/key
# + over gcc 3.4.6	+30/0%	+17/11%	+11/0%	+63/40%
# + over icc 8.0	+18/3%	+10/0%	+10/3%	+21/10%
#
# Pairs of numbers in "+" rows represent performance improvement over
# compiler generated position-independent code, PIC, and non-PIC
# respectively. PIC results are of greater relevance, as this module
# is position-independent, i.e. suitable for a shared library or PIE.
# Position independence "costs" one register, which is why compilers
# are so close with non-PIC results, they have an extra register to
# spare. CBC results are better than ECB ones thanks to "zero-copy"
# private _x86_* interface, and are ~30-40% better than with compiler
# generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
# same CPU (where applicable).

# Locate the directory this script lives in, so that x86asm.pl can be
# found both next to the script and in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# When set, the OpenSSL-style Camellia_encrypt/Camellia_decrypt entry
# points (taking in, out, key) are generated as well.
$OPENSSL=1;

&asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");

# Register assignment used throughout the generated code.
@T=("eax","ebx","ecx","edx");
$idx="esi";
$key="edi";
$Tbl="ebp";

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# s0 backing store
$__s1=&DWP(8,"esp");	# s1 backing store
$__s2=&DWP(12,"esp");	# s2 backing store
$__s3=&DWP(16,"esp");	# s3 backing store
$__end=&DWP(20,"esp");	# pointer to end/start of key schedule

# stack frame layout in Camellia_[en|crypt] routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
$_end=&DWP(16,"esp");
$_esp=&DWP(20,"esp");

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Camellia_Feistel($i [, $seed [, $frame]])
#
# Emits the instruction sequence for one Feistel step.
#   $i     - step index; its parity selects which pair of @T registers
#            plays the role of t0/t1 and which pair is t2/t3, and it
#            scales the key-schedule displacement.
#   $seed  - base displacement into the key schedule addressed by $key
#            (default 0). A negative seed makes the displacement step
#            by -8 per $i instead of +8.
#   $frame - displacement from %esp of the s0 backing store (default 0).
# On entry $idx is expected to hold the first key word for this step;
# on exit it holds the prefetched key word for step $i+1.
sub Camellia_Feistel {
# Unpack arguments as scalars and keep every temporary lexical; a bare
# "my $a=..,$b=.." would only declare the first variable and leak the
# rest as package globals.
my ($i,$seed,$frame)=@_;
$seed=0  unless defined($seed);
$frame=0 unless defined($frame);
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
my ($t0,$t1,$t2,$t3)=map { $T[($j+$_)%4] } (0..3);

	&xor	($t0,$idx);				# t0^=key[0]
	&xor	($t1,&DWP($seed+$i*$scale+4,$key));	# t1^=key[1]
	&movz	($idx,&HB($t0));			# (t0>>8)&0xff
	&mov	($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t3=SBOX3_3033[0]
	&movz	($idx,&LB($t0));			# (t0>>0)&0xff
	&xor	($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t3^=SBOX4_4404[0]
	&shr	($t0,16);
	&movz	($idx,&LB($t1));			# (t1>>0)&0xff
	&mov	($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t2=SBOX1_1110[1]
	&movz	($idx,&HB($t0));			# (t0>>24)&0xff
	&xor	($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));	# t3^=SBOX1_1110[0]
	&movz	($idx,&HB($t1));			# (t1>>8)&0xff
	&xor	($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));	# t2^=SBOX4_4404[1]
	&shr	($t1,16);
	&movz	($t0,&LB($t0));				# (t0>>16)&0xff
	&xor	($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));	# t3^=SBOX2_0222[0]
	&movz	($idx,&HB($t1));			# (t1>>24)&0xff
	&mov	($t0,&DWP($frame+4*(($j+3)%4),"esp"));	# prefetch "s3"
	&xor	($t2,$t3);				# t2^=t3
	&rotr	($t3,8);				# t3=RightRotate(t3,8)
	&xor	($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));	# t2^=SBOX2_0222[1]
	&movz	($idx,&LB($t1));			# (t1>>16)&0xff
	&mov	($t1,&DWP($frame+4*(($j+2)%4),"esp"));	# prefetch "s2"
	&xor	($t3,$t0);				# t3^=s3
	&xor	($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));	# t2^=SBOX3_3033[1]
	&mov	($idx,&DWP($seed+($i+1)*$scale,$key));	# prefetch key[i+1]
	&xor	($t3,$t2);				# t3^=t2
	&mov	(&DWP($frame+4*(($j+3)%4),"esp"),$t3);	# s3=t3
	&xor	($t2,$t1);				# t2^=s2
	&mov	(&DWP($frame+4*(($j+2)%4),"esp"),$t2);	# s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# Public entry point: sets up the private stack frame, loads and
# byte-swaps the 16-byte input, calls _x86_Camellia_encrypt and stores
# the byte-swapped result.
&function_begin("Camellia_EncryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);		# grandRounds*64 bytes of key schedule
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	# position-independent way of obtaining the address of Camellia_SBOX
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");

# V1.x API
# Rewrites the first argument from keyBitLength to a grandRounds count
# in place and tail-jumps to Camellia_EncryptBlock_Rounds.
&function_begin_B("Camellia_EncryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
#
# Same as Camellia_EncryptBlock_Rounds, except that the grandRounds
# counter is read from offset 272 of the key structure and the argument
# order is (in, out, key).
&function_begin("Camellia_encrypt");
	&mov	($idx,&wparam(0));	# load plaintext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&lea	("eax",&DWP(0,$key,"eax"));
	&mov	($_esp,"ebx");	# save %esp
	&mov	($_end,"eax");	# save keyEnd

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load plaintext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_encrypt");

	&mov	("esp",$_esp);
	&bswap	(@T[0]);
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write ciphertext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encryption core. Expects the block in @T[0-3], the key
# schedule pointer in $key, the table pointer in $Tbl and a
# caller-allocated frame laid out as described by $__s0..$__end above.
# Returns the result in @T[0-3].
&function_begin_B("_x86_Camellia_encrypt");
	&xor	(@T[0],&DWP(0,$key));	# ^=key[0-3]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&mov	($idx,&DWP(16,$key));	# prefetch key[4]

	&mov	($__s0,@T[0]);		# save s[0-3]
	&mov	($__s1,@T[1]);
	&mov	($__s2,@T[2]);
	&mov	($__s3,@T[3]);

&set_label("loop",16);
	# six unrolled Feistel steps per pass; key words start at offset 16,
	# s0 backing store sits at 4(%esp)
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

	&add	($key,16*4);
	&cmp	($key,$__end);
	&je	(&label("done"));

	# @T[0-1] are preloaded, $idx is preloaded with key[0]
	&and	($idx,@T[0]);
	 &mov	 (@T[3],$__s3);
	&rotl	($idx,1);
	 &mov	 (@T[2],@T[3]);
	&xor	(@T[1],$idx);
	 &or	 (@T[2],&DWP(12,$key));
	&mov	($__s1,@T[1]);		# s1^=LeftRotate(s0&key[0],1);
	 &xor	 (@T[2],$__s2);

	&mov	($idx,&DWP(4,$key));
	 &mov	 ($__s2,@T[2]);		# s2^=s3|key[3];
	&or	($idx,@T[1]);
	 &and	 (@T[2],&DWP(8,$key));
	&xor	(@T[0],$idx);
	 &rotl	 (@T[2],1);
	&mov	($__s0,@T[0]);		# s0^=s1|key[1];
	 &xor	 (@T[3],@T[2]);
	&mov	($idx,&DWP(16,$key));		# prefetch key[4]
	 &mov	 ($__s3,@T[3]);		# s3^=LeftRotate(s2&key[2],1);
	&jmp	(&label("loop"));

&set_label("done",8);
	&mov	(@T[2],@T[0]);		# SwapHalf
	&mov	(@T[3],@T[1]);
	&mov	(@T[0],$__s2);
	&mov	(@T[1],$__s3);
	&xor	(@T[0],$idx);		# $idx is preloaded with key[0]
	&xor	(@T[1],&DWP(4,$key));
	&xor	(@T[2],&DWP(8,$key));
	&xor	(@T[3],&DWP(12,$key));
	&ret	();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#		int grandRounds,
#		const Byte ciphertext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte plaintext[])
#
# Decryption counterpart of Camellia_EncryptBlock_Rounds: $key is
# advanced to the end of the schedule and the start of the schedule is
# what gets saved in the frame.
&function_begin("Camellia_DecryptBlock_Rounds");
	&mov	("eax",&wparam(0));	# load grandRounds
	&mov	($idx,&wparam(1));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	&lea	($key,&DWP(0,$key,"eax"));
	&mov	(&DWP(5*4,"esp"),"ebx");# save %esp

	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop($Tbl);
	&lea	($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

	&mov	(@T[0],&DWP(0,$idx));	# load ciphertext
	&mov	(@T[1],&DWP(4,$idx));
	&mov	(@T[2],&DWP(8,$idx));
	&bswap	(@T[0]);
	&mov	(@T[3],&DWP(12,$idx));
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);

	&call	("_x86_Camellia_decrypt");

	&mov	("esp",&DWP(5*4,"esp"));
	&bswap	(@T[0]);
	&mov	($idx,&wparam(3));	# load plaintext pointer
	&bswap	(@T[1]);
	&bswap	(@T[2]);
	&bswap	(@T[3]);
	&mov	(&DWP(0,$idx),@T[0]);	# write plaintext
	&mov	(&DWP(4,$idx),@T[1]);
	&mov	(&DWP(8,$idx),@T[2]);
	&mov	(&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");

# V1.x API
# Rewrites the first argument from keyBitLength to a grandRounds count
# in place and tail-jumps to Camellia_DecryptBlock_Rounds.
&function_begin_B("Camellia_DecryptBlock");
	&mov	("eax",128);
	&sub	("eax",&wparam(0));	# load keyBitLength
	&mov	("eax",3);
	&adc	("eax",0);		# keyBitLength==128?3:4
	&mov	(&wparam(0),"eax");
	&jmp	(&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#		const unsigned char *in,
#		unsigned char *out,
#		const CAMELLIA_KEY *key)
&function_begin("Camellia_decrypt");
	&mov	($idx,&wparam(0));	# load ciphertext pointer
	&mov	($key,&wparam(2));	# load key schedule pointer

	&mov	("ebx","esp");
	&sub	("esp",7*4);		# place for s[0-3],keyEnd,esp and ra
	&and	("esp",-64);
	&mov	("eax",&DWP(272,$key));	# load grandRounds counter

	# place stack frame just "above mod 1024" the key schedule
	# this ensures that cache associativity of 2 suffices
	&lea	("ecx",&DWP(-64-63,$key));
	&sub	("ecx","esp");
	&neg	("ecx");
	&and	("ecx",0x3C0);	# modulo 1024, but aligned to cache-line
	&sub	("esp","ecx");
	&add	("esp",4);	# 4 is reserved for callee's return address

	&shl	("eax",6);
	&mov	(&DWP(4*4,"esp"),$key);	# save keyStart
	# NOTE(review): this listing is page 1 of 3; the remainder of
	# Camellia_decrypt and the closing brace of the enclosing
	# if ($OPENSSL) block presumably follow on the next page.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -