📄 cmll-x86.pl
字号:
# Advance $key by the offset held in eax, then stash ebx in the 5*4(%esp) slot.
    &lea    ($key,&DWP(0,$key,"eax"));
    &mov    (&DWP(5*4,"esp"),"ebx");        # save %esp

    # Position-independent lookup of the S-box table address into $Tbl.
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov    (@T[0],&DWP(0,$idx));           # load ciphertext
    &mov    (@T[1],&DWP(4,$idx));
    &mov    (@T[2],&DWP(8,$idx));
    &bswap  (@T[0]);
    &mov    (@T[3],&DWP(12,$idx));
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);

    &call   ("_x86_Camellia_decrypt");

    &mov    ("esp",&DWP(5*4,"esp"));
    &bswap  (@T[0]);
    &mov    ($idx,&wparam(1));              # load plaintext pointer
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);
    &mov    (&DWP(0,$idx),@T[0]);           # write plaintext
    &mov    (&DWP(4,$idx),@T[1]);
    &mov    (&DWP(8,$idx),@T[2]);
    &mov    (&DWP(12,$idx),@T[3]);
&function_end("Camellia_decrypt");
}

# _x86_Camellia_decrypt: internal decryption core.
#
# On entry the four state words are in @T[0..3], $key addresses the key
# words to be applied first and $Tbl is expected to address the S-box table
# (the caller above sets it up that way).  The routine
#   1. XORs the state with key[0..3] and saves it to $__s0..$__s3;
#   2. in "loop", emits six Camellia_Feistel rounds, then moves $key back
#      by 16*4 bytes and leaves through "done" once $key equals $__end;
#   3. otherwise applies the and/or/rotate-by-1 mixing layer described by
#      the inline s0..s3 comments and iterates again;
#   4. in "done", swaps the two halves and XORs in the final four key words.
# The result is returned in @T[0..3].
&function_begin_B("_x86_Camellia_decrypt");
    &xor    (@T[0],&DWP(0,$key));           # ^=key[0-3]
    &xor    (@T[1],&DWP(4,$key));
    &xor    (@T[2],&DWP(8,$key));
    &xor    (@T[3],&DWP(12,$key));
    &mov    ($idx,&DWP(-8,$key));           # prefetch key[-2]

    &mov    ($__s0,@T[0]);                  # save s[0-3]
    &mov    ($__s1,@T[1]);
    &mov    ($__s2,@T[2]);
    &mov    ($__s3,@T[3]);

&set_label("loop",16);
    for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }

    &sub    ($key,16*4);
    &cmp    ($key,$__end);
    &je     (&label("done"));

    # @T[0-1] are preloaded, $idx is preloaded with key[2]
    &and    ($idx,@T[0]);
    &mov    (@T[3],$__s3);
    &rotl   ($idx,1);
    &mov    (@T[2],@T[3]);
    &xor    (@T[1],$idx);
    &or     (@T[2],&DWP(4,$key));
    &mov    ($__s1,@T[1]);                  # s1^=LeftRotate(s0&key[0],1);
    &xor    (@T[2],$__s2);

    &mov    ($idx,&DWP(12,$key));
    &mov    ($__s2,@T[2]);                  # s2^=s3|key[3];
    &or     ($idx,@T[1]);
    &and    (@T[2],&DWP(0,$key));
    &xor    (@T[0],$idx);
    &rotl   (@T[2],1);
    &mov    ($__s0,@T[0]);                  # s0^=s1|key[1];
    &xor    (@T[3],@T[2]);
    &mov    ($idx,&DWP(-8,$key));           # prefetch key[4]
    &mov    ($__s3,@T[3]);                  # s3^=LeftRotate(s2&key[2],1);
    &jmp    (&label("loop"));

&set_label("done",8);
    &mov    (@T[2],@T[0]);                  # SwapHalf
    &mov    (@T[3],@T[1]);
    &mov    (@T[0],$__s2);
    &mov    (@T[1],$__s3);
    &xor    (@T[2],$idx);                   # $idx is preloaded with key[2]
    &xor    (@T[3],&DWP(12,$key));
    &xor    (@T[0],&DWP(0,$key));
    &xor    (@T[1],&DWP(4,$key));
    &ret    ();
&function_end_B("_x86_Camellia_decrypt");

# shld is very slow on Intel P4 family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance. PIII, PM and Core[2] seem to be the only ones which
# execute this code ~7% faster...
#
# __rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@T): shld-based variant of _rotl128.
# Emits code that rotates the 128-bit value held in registers $i0..$i3
# ($i0 most significant) left by $rot bits, using $idx as scratch, and then
# stores the registers named in the trailing list @T to consecutive 32-bit
# slots starting at -128+4*(2*$rnd) relative to $key.  A register is stored
# only when it is the next pending entry of @T, so @T must list registers in
# the same order as $i0..$i3.
sub __rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;                              # 8-byte round index -> 4-byte slot index
    if ($rot) {
        &mov    ($idx,$i0);                 # keep old top word for the wrap-around
        &shld   ($i0,$i1,$rot);
        &shld   ($i1,$i2,$rot);
        &shld   ($i2,$i3,$rot);
        &shld   ($i3,$idx,$rot);
    }
    &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i0 eq @T[0]);
    &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i1 eq @T[0]);
    &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i2 eq @T[0]);
    &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i3 eq @T[0]);
}

# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
#
# _rotl128($i0,$i1,$i2,$i3,$rot,$rnd,@T): same contract as __rotl128, built
# from shl/shr/or.  Uses both $Tbl and $idx as scratch registers, so $Tbl no
# longer holds its previous value afterwards when $rot is non-zero.  Stores
# are interleaved with the shifts as soon as each word is final.  With
# $rot==0 no rotation is emitted and only the stores are generated.
sub _rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;                              # 8-byte round index -> 4-byte slot index
    if ($rot) {
        &mov    ($Tbl,$i0);                 # old $i0, its top bits wrap into $i3
        &shl    ($i0,$rot);
        &mov    ($idx,$i1);
        &shr    ($idx,32-$rot);
        &shl    ($i1,$rot);
        &or     ($i0,$idx);
        &mov    ($idx,$i2);
        &shl    ($i2,$rot);
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i0 eq @T[0]);
        &shr    ($idx,32-$rot);
        &or     ($i1,$idx);
        &shr    ($Tbl,32-$rot);
        &mov    ($idx,$i3);
        &shr    ($idx,32-$rot);
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i1 eq @T[0]);
        &shl    ($i3,$rot);
        &or     ($i2,$idx);
        &or     ($i3,$Tbl);
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i2 eq @T[0]);
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i3 eq @T[0]);
    } else {
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i0 eq @T[0]);
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i1 eq @T[0]);
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i2 eq @T[0]);
        &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))    if ($i3 eq @T[0]);
    }
}

# _saveround($rnd,$key,[$bias,]@T): emit stores of up to four registers from
# @T to $bias+$rnd*8+{0,4,8,12} relative to $key.  If the first element after
# $key evaluates to a non-zero number it is taken as $bias and removed from
# the list; register names evaluate to 0, so $bias then defaults to 0.
sub _saveround {
    my ($rnd,$key,@T)=@_;
    my $bias=int(@T[0])?shift(@T):0;

    &mov    (&DWP($bias+$rnd*8+0,$key),@T[0]);
    &mov    (&DWP($bias+$rnd*8+4,$key),@T[1])       if ($#T>=1);
    &mov    (&DWP($bias+$rnd*8+8,$key),@T[2])       if ($#T>=2);
    &mov    (&DWP($bias+$rnd*8+12,$key),@T[3])      if ($#T>=3);
}

# _loadround($rnd,$key,[$bias,]@T): inverse of _saveround; emit loads of up
# to four registers in @T from $bias+$rnd*8+{0,4,8,12} relative to $key.
# The optional numeric $bias is detected the same way as in _saveround.
sub _loadround {
    my ($rnd,$key,@T)=@_;
    my $bias=int(@T[0])?shift(@T):0;

    &mov    (@T[0],&DWP($bias+$rnd*8+0,$key));
    &mov    (@T[1],&DWP($bias+$rnd*8+4,$key))       if ($#T>=1);
    &mov    (@T[2],&DWP($bias+$rnd*8+8,$key))       if ($#T>=2);
    &mov    (@T[3],&DWP($bias+$rnd*8+12,$key))      if ($#T>=3);
}

# void Camellia_Ekeygen(
#               const int keyBitLength,
#               const Byte *rawKey,
#               KEY_TABLE_TYPE keyTable)
#
# Generates the key schedule into keyTable.  KL (and, for 192/256-bit keys,
# KR) is loaded byte-swapped from rawKey; for 192-bit keys the missing half
# of KR is the bitwise complement of the loaded half.  KA (and KB for the
# longer keys) is derived with pairs of Camellia_Feistel rounds keyed from
# Camellia_SIGMA, after which rotated copies of KL/KR/KA/KB are written to
# the table through _rotl128.  On exit eax holds the number of grand rounds
# (3 for 128-bit keys, 4 otherwise) and edx addresses byte 272 of keyTable,
# which is where the caller below stores that count.
# $step is a generation-time counter selecting the SIGMA entry per round.
&function_begin("Camellia_Ekeygen");
{ my $step=0;

    &stack_push(4);                         # place for s[0-3]

    &mov    ($Tbl,&wparam(0));              # load arguments
    &mov    ($idx,&wparam(1));
    &mov    ($key,&wparam(2));

    &mov    (@T[0],&DWP(0,$idx));           # load 0-127 bits
    &mov    (@T[1],&DWP(4,$idx));
    &mov    (@T[2],&DWP(8,$idx));
    &mov    (@T[3],&DWP(12,$idx));

    &bswap  (@T[0]);
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);

    &_saveround (0,$key,@T);                # KL<<<0

    &cmp    ($Tbl,128);
    &je     (&label("1st128"));

    &mov    (@T[0],&DWP(16,$idx));          # load 128-191 bits
    &mov    (@T[1],&DWP(20,$idx));
    &cmp    ($Tbl,192);
    &je     (&label("1st192"));
    &mov    (@T[2],&DWP(24,$idx));          # load 192-255 bits
    &mov    (@T[3],&DWP(28,$idx));
    &jmp    (&label("1st256"));
&set_label("1st192",4);
    &mov    (@T[2],@T[0]);
    &mov    (@T[3],@T[1]);
    &not    (@T[2]);                        # 192-bit key: upper half of KR is
    &not    (@T[3]);                        # the complement of the lower half
&set_label("1st256",4);
    &bswap  (@T[0]);
    &bswap  (@T[1]);
    &bswap  (@T[2]);
    &bswap  (@T[3]);

    &_saveround (4,$key,@T);                # temporary storage for KR!

    &xor    (@T[0],&DWP(0*8+0,$key));       # KR^KL
    &xor    (@T[1],&DWP(0*8+4,$key));
    &xor    (@T[2],&DWP(1*8+0,$key));
    &xor    (@T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
    &call   (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
    &lea    ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

    &mov    ($idx,&DWP($step*8,$key));      # prefetch SIGMA[0]
    &mov    (&swtmp(0),@T[0]);              # save s[0-3]
    &mov    (&swtmp(1),@T[1]);
    &mov    (&swtmp(2),@T[2]);
    &mov    (&swtmp(3),@T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov    (@T[2],&swtmp(2));
    &mov    (@T[3],&swtmp(3));

    &mov    ($idx,&wparam(2));
    &xor    (@T[0],&DWP(0*8+0,$idx));       # ^KL
    &xor    (@T[1],&DWP(0*8+4,$idx));
    &xor    (@T[2],&DWP(1*8+0,$idx));
    &xor    (@T[3],&DWP(1*8+4,$idx));

    &mov    ($idx,&DWP($step*8,$key));      # prefetch SIGMA[4]
    &mov    (&swtmp(0),@T[0]);              # save s[0-3]
    &mov    (&swtmp(1),@T[1]);
    &mov    (&swtmp(2),@T[2]);
    &mov    (&swtmp(3),@T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov    (@T[2],&swtmp(2));
    &mov    (@T[3],&swtmp(3));

    &mov    ($idx,&wparam(0));
    &cmp    ($idx,128);
    &jne    (&label("2nd256"));

    &mov    ($key,&wparam(2));
    &lea    ($key,&DWP(128,$key));          # size optimization

    ####### process KA
    &_saveround (2,$key,-128,@T);           # KA<<<0
    &_rotl128   (@T,15,6,@T);               # KA<<<15
    &_rotl128   (@T,15,8,@T);               # KA<<<(15+15=30)
    &_rotl128   (@T,15,12,@T[0],@T[1]);     # KA<<<(30+15=45)
    &_rotl128   (@T,15,14,@T);              # KA<<<(45+15=60)
    push        (@T,shift(@T));             # rotl128(@T,32);
    &_rotl128   (@T,2,20,@T);               # KA<<<(60+32+2=94)
    &_rotl128   (@T,17,24,@T);              # KA<<<(94+17=111)

    ####### process KL
    &_loadround (0,$key,-128,@T);           # load KL
    &_rotl128   (@T,15,4,@T);               # KL<<<15
    &_rotl128   (@T,30,10,@T);              # KL<<<(15+30=45)
    &_rotl128   (@T,15,13,@T[2],@T[3]);     # KL<<<(45+15=60)
    &_rotl128   (@T,17,16,@T);              # KL<<<(60+17=77)
    &_rotl128   (@T,17,18,@T);              # KL<<<(77+17=94)
    &_rotl128   (@T,17,22,@T);              # KL<<<(94+17=111)

    while (@T[0] ne "eax")                  # restore order
    {   unshift (@T,pop(@T));   }

    &mov    ("eax",3);                      # 3 grandRounds
    &jmp    (&label("done"));

&set_label("2nd256",16);
    &mov    ($idx,&wparam(2));
    &_saveround (6,$idx,@T);                # temporary storage for KA!

    &xor    (@T[0],&DWP(4*8+0,$idx));       # KA^KR
    &xor    (@T[1],&DWP(4*8+4,$idx));
    &xor    (@T[2],&DWP(5*8+0,$idx));
    &xor    (@T[3],&DWP(5*8+4,$idx));

    &mov    ($idx,&DWP($step*8,$key));      # prefetch SIGMA[8]
    &mov    (&swtmp(0),@T[0]);              # save s[0-3]
    &mov    (&swtmp(1),@T[1]);
    &mov    (&swtmp(2),@T[2]);
    &mov    (&swtmp(3),@T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov    (@T[2],&swtmp(2));
    &mov    (@T[3],&swtmp(3));

    &mov    ($key,&wparam(2));
    &lea    ($key,&DWP(128,$key));          # size optimization

    ####### process KB
    &_saveround (2,$key,-128,@T);           # KB<<<0
    &_rotl128   (@T,30,10,@T);              # KB<<<30
    &_rotl128   (@T,30,20,@T);              # KB<<<(30+30=60)
    push        (@T,shift(@T));             # rotl128(@T,32);
    &_rotl128   (@T,19,32,@T);              # KB<<<(60+32+19=111)

    ####### process KR
    &_loadround (4,$key,-128,@T);           # load KR
    &_rotl128   (@T,15,4,@T);               # KR<<<15
    &_rotl128   (@T,15,8,@T);               # KR<<<(15+15=30)
    &_rotl128   (@T,30,18,@T);              # KR<<<(30+30=60)
    push        (@T,shift(@T));             # rotl128(@T,32);
    &_rotl128   (@T,2,26,@T);               # KR<<<(60+32+2=94)

    ####### process KA
    &_loadround (6,$key,-128,@T);           # load KA
    &_rotl128   (@T,15,6,@T);               # KA<<<15
    &_rotl128   (@T,30,14,@T);              # KA<<<(15+30=45)
    push        (@T,shift(@T));             # rotl128(@T,32);
    &_rotl128   (@T,0,24,@T);               # KA<<<(45+32+0=77)
    &_rotl128   (@T,17,28,@T);              # KA<<<(77+17=94)

    ####### process KL
    &_loadround (0,$key,-128,@T);           # load KL
    push        (@T,shift(@T));             # rotl128(@T,32);
    &_rotl128   (@T,13,12,@T);              # KL<<<(32+13=45)
    &_rotl128   (@T,15,16,@T);              # KL<<<(45+15=60)
    &_rotl128   (@T,17,22,@T);              # KL<<<(60+17=77)
    push        (@T,shift(@T));             # rotl128(@T,32);
    &_rotl128   (@T,2,30,@T);               # KL<<<(77+32+2=111)

    while (@T[0] ne "eax")                  # restore order
    {   unshift (@T,pop(@T));   }

    &mov    ("eax",4);                      # 4 grandRounds
&set_label("done");
    &lea    ("edx",&DWP(272-128,$key));     # end of key schedule
    &stack_pop(4);
}
&function_end("Camellia_Ekeygen");

if ($OPENSSL) {
# int Camellia_set_key (
#               const unsigned char *userKey,
#               int bits,
#               CAMELLIA_KEY *key)
#
# Validates the arguments (eax=-1 for a NULL pointer, eax=-2 for a key size
# other than 128/192/256), then calls Camellia_Ekeygen(bits,userKey,key),
# stores the grand-round count it returns at the address left in edx and
# clears eax.
&function_begin_B("Camellia_set_key");
    &push   ("ebx");
    &mov    ("ecx",&wparam(0));             # pull arguments
    &mov    ("ebx",&wparam(1));
    &mov    ("edx",&wparam(2));

    &mov    ("eax",-1);
    &test   ("ecx","ecx");
    &jz     (&label("done"));               # userKey==NULL?
    &test   ("edx","edx");
    &jz     (&label("done"));               # key==NULL?

    &mov    ("eax",-2);
    &cmp    ("ebx",256);
    &je     (&label("arg_ok"));             # bits==256?
    &cmp    ("ebx",192);
    &je     (&label("arg_ok"));             # bits==192?
    &cmp    ("ebx",128);
    &jne    (&label("done"));               # bits!=128?
&set_label("arg_ok",4);

    &push   ("edx");                        # push arguments
    &push   ("ecx");
    &push   ("ebx");
    &call   ("Camellia_Ekeygen");
    &stack_pop(3);

    # eax holds grandRounds and edx points at where to put it
    &mov    (&DWP(0,"edx"),"eax");
    &xor    ("eax","eax");
&set_label("done",4);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -