📄 2fish_86.asm
字号:
mov ecx,lSubkey[4*edi+SUBKEY_SIZE*2]
xor ebx,edx
mov edx,lSubkey[4*edi+SUBKEY_SIZE*2+4]
xor eax,ecx
mov ecx,lSubkey[4*edi+SUBKEY_SIZE*3]
xor ebx,edx
mov edx,lSubkey[4*edi+SUBKEY_SIZE*3+4]
xor eax,ecx
xor ebx,edx
rol ebx,8
add eax,ebx
add ebx,eax
mov ks.subKeys[4*edi ],eax
rol ebx,9
mov ks.subKeys[4*edi+4],ebx
sub edi,2
jae subkeyLp_&cpuName
; jmpRet cpuName ;; 2207 to here (MMX:1370)(Pro: 1182) ["subKey"]
if (KEY_MODE and KM_ZERO) eq 0
; now build the 8x32 S-boxes (including MDS matrix)
lea esi,tmpSbox+128
mov ecx,kLen64
add edi,2+3 ;set edi == 3 (use 8-bit "offset")
mov edx,mdsJmpTab_&cpuName[4*ecx-4]
mov esi,pPtrTab[4*ecx-4]
mov reKeyJmpPtr,edx
lea ecx,SboxKey[4*ecx-4] ;point to first dword of key used
mov pPtr,esi
mov kPtr,ecx
SboxLp_&cpuName:
; here with edi = byte number, esi = pointer into pTab "first" used entry
; ecx = kPtr
xor edx,edx
mov esi,[esi+4*edi] ;get ptr to first of q0/q1 to be used
mov dl,[ecx+edi] ;get the key byte
ifdif <cpuName>,<PentiumPro>
mov dh,dl
mov eax,edx
shl edx,16
mov ecx,256-32
or edx,eax
else
mov eax,1010101h
mul edx
mov ecx,256-32
mov edx,eax
endif
p1stLp_&cpuName: ;do the first level of perm8x8/xor
irp _K_,<0,8,16,24>
pXor8 tmpSbox[ecx],[esi+ecx],edx,%(_K_)
endm
sub ecx,32
jae p1stLp_&cpuName
xor ebx,ebx ;clear upper bits
jmp reKeyJmpPtr ;go handle the remaining levels
mds256_&cpuName:
ld8 <b>,<byte ptr SboxKey[edi+8]>,cpuName
mov esi,pTab[4*edi+32]
call permXor_&cpuName
mds192_&cpuName:
ld8 <b>,<byte ptr SboxKey[edi+4]>,cpuName
mov esi,pTab[4*edi+16]
call permXor_&cpuName
mds128_&cpuName:
if KEY_MODE and (KM_FULL or KM_PART or KM_COMPILE)
ld8 <b>,<byte ptr SboxKey[edi]>,cpuName
mov esi,pTab[4*edi]
call permXor_&cpuName ;compute final stage of 8-bit S-box
endif
if KEY_MODE and (KM_FULL or KM_COMPILE);---------;now expand from 8-bit to 32 via MDStab
mov tmp0,ebp
lea ebp,ks.fullSbox
mov tmp1,edi
mov eax,mdsOffsTab[4*edi]
shl edi,10
add ebp,eax ;ebp --> base of output table
xor eax,eax ;clear upper bits of eax
mov al,tmpSbox8[256-1]
xor ebx,ebx
mov bl,tmpSbox8[256-1-4]
add edi,offset MDStab ;base of this table in MDStab
mov esi,256-8
mds64Lp_&cpuName:
ifdif <cpuName>,<PentiumPro>
mov ecx,[ebp+8*esi] ;force cache line load for writes
mov edx,[ebp+8*esi+32] ;(no need for Pro)
endif
irp NN,<3,2,1,0>
mov ecx,[edi+4*eax] ;get the MDS table entry
mov edx,[edi+4*ebx]
if NN
ld8 <a>,tmpSbox8[esi+NN-1+4],cpuName ;load from different cache banks
ld8 <b>,tmpSbox8[esi+NN-1],cpuName
else
ld8 <a>,tmpSbox8[esi-1],cpuName ;get set for next time
ld8 <b>,tmpSbox8[esi-1-4],cpuName
endif
mov [ebp+8*esi+8*NN+8*4],ecx;store it in the Sbox under construction
mov [ebp+8*esi+8*NN],edx
endm
sub esi,8
jns mds64Lp_&cpuName
mov edi,tmp1 ;restore regs
mov ebp,tmp0
elseif KEY_MODE and (KM_PART or KM_MIN) ;-------;just copy the 8-bit Sbox
mov ecx,x8Tab[edi*4]
mov esi,256-64
lea ecx,ks.fullSbox[ecx]
pk_CopyLp_&cpuName:
ifdif <cpuName>,<PentiumPro> ;force load cache line before writing
mov eax,[ecx+esi]
mov ebx,[ecx+esi+36]
endif
irp NN,<7,6,5,4,3,2,1,0>
mov eax,tmpSbox[esi+NN*8]
mov ebx,tmpSbox[esi+NN*8+4]
mov [ecx+esi+NN*8],eax
mov [ecx+esi+NN*8+4],ebx
endm
sub esi,64
jae pk_CopyLp_&cpuName
endif
mov ecx,kPtr ;get ready for next time
mov esi,pPtr
dec edi ;decrement byte number
jns SboxLp_&cpuName ;keep going until all bytes processed
; jmpRet cpuName ;;10000 to here (MMX:6410)(Pro: 7711) ["All"]
if KEY_MODE and KM_COMPILE ;copy the code over and "compile" it
cmp ks.keySig,VALID_SIG ;must be correct C model as well
jnz badKeyLen_&cpuName ;hang if not
cld
lea esi,cipherProcStart_&cpuName
lea ecx,cipherProcEnd_&cpuName+63
mov edx,esi
lea edi,ks.cipherProcCode[63]
and esi,NOT 63 ;put on 64 byte boundaries
and ecx,NOT 63
and edi,NOT 63 ;keep on paragraph boundaries
sub ecx,esi
sub edx,esi ;edx=how much we moved down
add edx,edi ;edx=where cipherProcStart is in cipherProcCode
cmp ecx,ks.codeSize ;is there room?
ja badKeyLen_&cpuName ;hang if not
shr ecx,2 ;do it one dword at a time
sub edx,offset cipherProcStart_&cpuName ;edx=value to add to translate offsets
ifdif <cpuName>,<PentiumPro>
cpyLp:
mov eax,[edi] ;load cache lines to speed things up
mov ebx,[edi+36]
irp QQ,<0,8,16,24,32,40,48,56>
mov eax,[esi+QQ] ;move 8 bytes at a time
mov ebx,[esi+QQ+4]
mov [edi+QQ],eax
mov [edi+QQ+4],ebx
endm
add esi,64
add edi,64
sub ecx,16
jae cpyLp
else
rep movsd ;copy over the code
endif
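; edx is now the value to add to an original code offset to find it in the
; copied code. Patch in the function pointers, the INPUT_WHITEN subkeys and
; then every entry of the encrypt/decrypt patch lists.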
lea eax,TwoFishEncrypt_&cpuName[edx]
lea ebx,TwoFishDecrypt_&cpuName[edx]
mov ks.encryptFuncPtr,eax ;set up the function pointers
mov ks.decryptFuncPtr,ebx
xor ecx,ecx
mov eax,ks.subKeys[INPUT_WHITEN]
mov ebx,ks.subKeys[INPUT_WHITEN+4]
mov dword ptr Enc_CBC_SK_0_&cpuName[edx-4],eax
mov dword ptr Enc_CBC_SK_1_&cpuName[edx-4],ebx
mov dword ptr Dec_CBC_SK_0_&cpuName[edx-4],eax
mov dword ptr Dec_CBC_SK_1_&cpuName[edx-4],ebx
mov eax,ks.subKeys[INPUT_WHITEN+8]
mov ebx,ks.subKeys[INPUT_WHITEN+12]
mov dword ptr Enc_CBC_SK_2_&cpuName[edx-4],eax
mov dword ptr Enc_CBC_SK_3_&cpuName[edx-4],ebx
mov dword ptr Dec_CBC_SK_2_&cpuName[edx-4],eax
mov dword ptr Dec_CBC_SK_3_&cpuName[edx-4],ebx
patchLp_&cpuName:
mov eax,PatchList_Enc_&cpuName[4*ecx] ;now patch in the subkeys
mov ebx,PatchList_Dec_&cpuName[4*ecx]
mov edi,ks.subKeys[4*ecx];
inc ecx
mov [eax+edx],edi
mov [ebx+edx],edi
cmp ecx,TOTAL_SUBKEYS
jb patchLp_&cpuName
endif
endif ;!KM_ZERO
reKeyDone_&cpuName:
add esp,localSize2
popad
mov eax,1 ;success
ret
;
; permXor: pass the 256-byte table at _S_ through one byte permutation and
; xor one key byte into every entry, in place:
;	_S_[i] = q[_S_[i]] xor key	for i = 255 down to 0
; Eight bytes are handled per loop pass, highest addresses first.
;
; Input: tmpSbox permutation (at _S_, due to call return address)
; esi --> q0 or q1, permutation through which to pass tmpSbox
; ebx = key byte to xor into data after q0/q1
;	(the replicated key is built from ebx itself: the first variant
;	 needs bits 16..31 of ebx clear, the second needs bits 8..31 clear)
; Output: tmpSbox updated
; edi,ebp unmodified (both are pushed on entry and popped before ret)
; eax,ebx,ecx,edx and the flags are overwritten
; NOTE(review): _S_ is defined outside this view; presumably its offset
;	already allows for the return address and the two pushes below - confirm.
permXor_&cpuName:
ifdif <cpuName>,<PentiumPro>
;---- byte-register variant: two interleaved lookup chains (eax and ebx)
push edi ;save edi on stack so we don't modify it
mov bh,bl ;bx = key byte twice
push ebp
mov ebp,ebx
shl ebx,16
xor eax,eax ;upper bits of eax stay zero so eax can index q
mov edi,256-8 ;edi = offset of the 8-byte group being processed
mov al,_S_[256-1] ;preload the first index for the upper dword
or ebp,ebx ;ebp=four bytes replicated from original ebx
xor ebx,ebx ;upper bits of ebx stay zero so ebx can index q
mov bl,_S_[256-1-4] ;preload the first index for the lower dword
pxLoop_Pentium:
irp NN,<0>
mov ch,[esi+eax] ;q[S[edi+7]]
mov al,_S_[edi+6+NN]
mov dh,[esi+ebx] ;q[S[edi+3]]
mov bl,_S_[edi+2+NN]
mov cl,[esi+eax] ;q[S[edi+6]]
mov al,_S_[edi+5+NN]
shl ecx,16 ;results for bytes 7,6 move to the top half of ecx
mov dl,[esi+ebx] ;q[S[edi+2]]
shl edx,16 ;results for bytes 3,2 move to the top half of edx
mov bl,_S_[edi+1+NN]
mov ch,[esi+eax] ;q[S[edi+5]]
mov al,_S_[edi+4+NN]
mov dh,[esi+ebx] ;q[S[edi+1]]
mov bl,_S_[edi+NN]
mov cl,[esi+eax] ;q[S[edi+4]]
mov al,_S_[edi-1+NN] ;preload index for the next pass (byte 7 of next group)
mov dl,[esi+ebx] ;q[S[edi]]
mov bl,_S_[edi-5+NN] ;preload index for the next pass (byte 3 of next group)
xor ecx,ebp ;xor the key byte into all four result bytes
xor edx,ebp
mov dword ptr _S_[edi+4+NN],ecx ;write back bytes edi+4..edi+7
mov dword ptr _S_[edi+NN],edx ;write back bytes edi..edi+3
endm
; on the final pass (edi=0) the two preloads above read just below _S_;
; those values are never used because the loop exits
sub edi,8
jae pxLoop_Pentium ;loop until edi goes below zero
else
;---- movzx variant: one dword of the table is fetched at a time
push edi
push ebp
mov ebp,ebx
mov edi,256-8 ;edi = offset of the 8-byte group being processed
shl ebp,8
or ebp,ebx ;low word of ebp = key byte twice
mov ebx,ebp
shl ebp,16
mov ecx,dword ptr _S_[edi+4];pick up next four bytes of permutation
or ebp,ebx ;ebp=four bytes replicated from original ebx
mov eax,ecx ;eax = copy used to extract bytes 2 and 3
movzx edx,ch ;edx = index byte 1
pxLoop_PentiumPro:
irp ZZ,<4,0>
movzx ecx,cl ;ecx = index byte 0
shr eax,16 ;al = index byte 2, ah = index byte 3
movzx edx,byte ptr [esi+edx] ;q[byte 1]
shl edx,8
movzx ecx,byte ptr [esi+ecx] ;q[byte 0]
movzx ebx,ah
movzx ebx,byte ptr [esi+ebx] ;q[byte 3]
movzx eax,al
xor ecx,ebp ;key goes into all four lanes here; q results are xored in below
shl ebx,24
movzx eax,byte ptr [esi+eax] ;q[byte 2]
xor edx,ecx
shl eax,16
xor ebx,edx
mov ecx,dword ptr _S_[edi+ZZ-4] ;get ready for next loop
xor ebx,eax ;ebx = four permuted bytes xor replicated key
mov eax,ecx
movzx edx,ch
mov dword ptr _S_[edi+ZZ],ebx ;write back bytes edi+ZZ..edi+ZZ+3
endm
; on the final pass (edi=0, ZZ=0) the look-ahead load reads the dword just
; below _S_; it is never used because the loop exits
sub edi,8
jae pxLoop_PentiumPro ;loop until edi goes below zero
endif
pop ebp
pop edi
ret
;
; Failure exit for the rekey procedure: trap, release the locals, restore the
; registers and return 0 (the success exit at reKeyDone returns 1).
; Reached through the first entries of the jump tables below, and also from
; the KM_COMPILE checks (keySig mismatch, copied code larger than ks.codeSize).
badKeyLen_&cpuName: ; will only come here if there is an invalid keyLen parameter
int 3 ;breakpoint trap so the failure is noticed under a debugger
add esp,localSize2 ;same stack cleanup as the success exit
popad
xor eax,eax ;return value 0 = failure
ret
; Jump tables for the rekey procedure.
; The mds table is read as table[4*kLen64-4] when the S-box build starts, so
; the dword placed just before its label serves index 0 and the labelled
; dword serves index 1; both lead to the failure exit. Indexes 2,3,4 select
; the 128/192/256-bit entry points.
; NOTE(review): kLen64 is presumably the key length in 64-bit units - confirm.
if (KEY_MODE and KM_ZERO) eq 0
dd badKeyLen_&cpuName ;entry at offset -4 (index 0)
mdsJmpTab_&cpuName dd badKeyLen_&cpuName ;index 1
dd mds128_&cpuName,mds192_&cpuName,mds256_&cpuName ;indexes 2,3,4
endif
; Subkey-stage table with the same shape; its user and the sk128/sk192/sk256
; labels lie outside this view.
skJmpTab_&cpuName dd badKeyLen_&cpuName
dd sk128_&cpuName,sk192_&cpuName,sk256_&cpuName
; Assemble-time size report. Nothing below emits code or data: the symbol is
; an assemble-time constant and each irp/%out pair prints one message, with
; the percent operator turning the expression into its numeric text.
; Rekey size = bytes from the rekey start label to here plus reKeySharedSize.
reKeyCode_&cpuName = ($-reKey_Start_&cpuName)+reKeySharedSize
irp cby,<%(reKeyCode_&cpuName)>
%out --- ReKey assembly code size == cby bytes (cpuName)
endm
; the big-key size symbol is defined outside this view
irp cby,<%(bigKeyCode_&cpuName)>
%out --- Big key assembly code size == cby bytes (cpuName)
endm
; 128-bit figure = rekey size minus the big-key portion
irp cby,<%(reKeyCode_&cpuName-bigKeyCode_&cpuName)>
%out --- 128 key assembly code size == cby bytes (cpuName)
endm
; total = bytes since the asmStart label of this variant plus reKeySharedSize
irp cby,<%($-asmStart_&cpuName+reKeySharedSize)>
%out --- Total assembly code size == cby bytes (cpuName)
%out
endm
TwoFishReKey_&cpuName endp
endm ;reKeyProc
;
;__MACRO_DEF_DONE__ ;sign to LST2ASM that macro definitions are done
;
;
;-------------------- Pentium (and 486) optimized -------------
;
asmStart_Pentium:
cipherProc Pentium
reKeyProc Pentium
asmEnd_Pentium:
;
;----------------- Pentium Pro/Pentium II optimized ------------
;
asmStart_PentiumPro:
cipherProc PentiumPro
reKeyProc PentiumPro
asmEnd_PentiumPro:
;
;----------------- Select which routines to use ---------------
;
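; useAsm: bit 0 --> enable encryption in ASM
; bit 1 --> enable decryption in ASM
; bit 2 --> enable reKey in ASM
; If any other bits are set (e.g., useAsm >= 8),
; the low three bits get discarded here (SHR), and
; the remaining value is used as a pseudo-return
; value from get_cpu_type, to allow command-line
; override of the cpu type:
; useAsm=8..47--> force Pentium code
; useAsm > 47 --> force PentiumPro code
; NOTE(review): setProcPtrs below performs no SHR on useAsm. It only tests
; bit 3 (value 8) and, when that bit is set, flips the Pentium/PentiumPro
; choice derived from get_cpu_type. The 8..47 / >47 ranges above look
; stale - confirm against the caller before relying on them.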
;
extrn get_cpu_type:near
extrn useAsm:byte
;
public TwoFishEncrypt_Pentium,TwoFishDecrypt_Pentium,TwoFishReKey_Pentium
public TwoFishEncrypt_PentiumPro,TwoFishDecrypt_PentiumPro,TwoFishReKey_PentiumPro
public TwofishAsmCodeSize
;-----------------------------------------------------------------------
; setProcPtrs - point blockEncrypt_86 / blockDecrypt_86 / reKey_86 at the
;               Pentium or the PentiumPro routines.
; In:    nothing
; Out:   the three pointers are written; every general register is
;        preserved (pushad/popad); flags are not preserved
; Logic: al = 0 when get_cpu_type returns 5 in al, otherwise 1;
;        bit 3 of useAsm flips that choice; a non-zero al selects the
;        PentiumPro set.
; NOTE(review): iff/endi are macros defined outside this view; presumably
;        they execute the enclosed lines only when the condition holds.
;-----------------------------------------------------------------------
setProcPtrs proc
        pushad

        ; start out with the Pentium entry points
        mov     blockEncrypt_86,offset TwoFishEncrypt_Pentium
        mov     blockDecrypt_86,offset TwoFishDecrypt_Pentium
        mov     reKey_86 ,offset TwoFishReKey_Pentium

        ; al = 0 for cpu type 5, else 1
        call    get_cpu_type
        xor     al,5
    iff nz
        mov     al,1
    endi

        ; useAsm bit 3 inverts the automatic choice
        test    useAsm,8
    iff nz
        xor     al,1
    endi

        ; non-zero al: switch all three pointers to the PentiumPro set
        test    al,al
    iff nz
        mov     blockEncrypt_86,offset TwoFishEncrypt_PentiumPro
        mov     blockDecrypt_86,offset TwoFishDecrypt_PentiumPro
        mov     reKey_86 ,offset TwoFishReKey_PentiumPro
    endi

        popad
        ret
setProcPtrs endp
; E_SelectCPU - encrypt stub: select the CPU-specific routines, then continue
; in the encrypt routine that was just selected. Registers and stack reach the
; target unchanged because setProcPtrs preserves all general registers.
; NOTE(review): presumably blockEncrypt_86 initially points here, so the stub
; runs only on the first call - confirm where the pointer is initialised.
E_SelectCPU proc
        call    setProcPtrs             ;fill in the three routine pointers
        jmp     blockEncrypt_86         ;tail-jump through the new pointer
E_SelectCPU endp
;
; D_SelectCPU - decrypt stub: select the CPU-specific routines, then continue
; in the decrypt routine that was just selected.
D_SelectCPU proc
        call    setProcPtrs             ;fill in the three routine pointers
        jmp     blockDecrypt_86         ;tail-jump through the new pointer
D_SelectCPU endp
; R_SelectCPU - rekey stub: select the CPU-specific routines, then continue
; in the rekey routine that was just selected.
R_SelectCPU proc
        call    setProcPtrs             ;fill in the three routine pointers
        jmp     reKey_86                ;tail-jump through the new pointer
R_SelectCPU endp
; code size functions
;-----------------------------------------------------------------------
; TwofishAsmCodeSize - report the code size of the selected CPU variant.
; In:    nothing
; Out:   eax = (asmEnd - asmStart) of the variant whose encrypt routine
;              setProcPtrs selected, plus reKeySharedSize
; Clobb: eax, flags; also rewrites the three routine pointers through
;        setProcPtrs
;-----------------------------------------------------------------------
TwofishAsmCodeSize proc
        call    setProcPtrs
        ; decide first, then load the default: mov leaves the flags alone
        cmp     blockEncrypt_86,offset TwoFishEncrypt_PentiumPro
        mov     eax,offset asmEnd_Pentium-asmStart_Pentium
    iff z
        mov     eax,offset asmEnd_PentiumPro-asmStart_PentiumPro
    endi
        add     eax,reKeySharedSize     ;shared rekey code counts for both
        ret
TwofishAsmCodeSize endp
end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -