📄 2fish_86.asm
字号:
biasEBP
; now run subkey s-box bytes through MDS and rotate/PHT/combine
subkeyLp_&cpuName:
mov eax,lSubkey[4*edi]
mov ebx,lSubkey[4*edi+4]
mov ecx,lSubkey[4*edi+SUBKEY_SIZE]
mov edx,lSubkey[4*edi+SUBKEY_SIZE+4]
xor eax,ecx
mov ecx,lSubkey[4*edi+SUBKEY_SIZE*2]
xor ebx,edx
mov edx,lSubkey[4*edi+SUBKEY_SIZE*2+4]
xor eax,ecx
mov ecx,lSubkey[4*edi+SUBKEY_SIZE*3]
xor ebx,edx
mov edx,lSubkey[4*edi+SUBKEY_SIZE*3+4]
xor eax,ecx
xor ebx,edx
rol ebx,8
add eax,ebx
add ebx,eax
mov ks.subKeys[4*edi ],eax
rol ebx,9
mov ks.subKeys[4*edi+4],ebx
sub edi,2
jae subkeyLp_&cpuName
; jmpRet cpuName ;; 2207 to here (MMX:1370)(Pro: 1182) ["subKey"]
if (KEY_MODE and KM_ZERO) eq 0
; now build the 8x32 S-boxes (including MDS matrix)
lea esi,tmpSbox+128
mov ecx,kLen64
add edi,2+3 ;set edi == 3 (use 8-bit "offset")
mov edx,mdsJmpTab_&cpuName[4*ecx-4]
mov esi,pPtrTab[4*ecx-4]
mov reKeyJmpPtr,edx
lea ecx,SboxKey[4*ecx-4] ;point to first dword of key used
mov pPtr,esi
mov kPtr,ecx
SboxLp_&cpuName:
; here with edi = byte number, esi = pointer into pTab "first" used entry
; ecx = kPtr
if BIG_TAB
xor eax,eax
mov esi,[esi+4*edi] ;get ptr to big table entry
mov al,[ecx+edi] ;get the key byte
shl eax,8 ;offset in the 64K table
xor edx,edx
add esi,eax
mov dl,[ecx+edi-4] ;get "next" key byte
else
xor edx,edx
mov esi,[esi+4*edi] ;get ptr to first of q0/q1 to be used
mov dl,[ecx+edi] ;get the key byte
endif
ifdif <cpuName>,<PentiumPro>
mov dh,dl
mov eax,edx
shl edx,16
mov ecx,256-32
or edx,eax
else
mov eax,1010101h
mul edx
mov ecx,256-32
mov edx,eax
endif
p1stLp_&cpuName: ;do the first level of perm8x8/xor
irp _K_,<0,8,16,24>
pXor8 tmpSbox[ecx],[esi+ecx],edx,%(_K_)
endm
sub ecx,32
jae p1stLp_&cpuName
xor ebx,ebx ;clear upper bits
jmp reKeyJmpPtr ;go handle the remaining levels
mds256_&cpuName:
ld8 <b>,<byte ptr SboxKey[edi+8]>,cpuName
mov esi,pTab[4*edi+32]
call permXor_&cpuName
mds192_&cpuName:
ld8 <b>,<byte ptr SboxKey[edi+4]>,cpuName
mov esi,pTab[4*edi+16]
call permXor_&cpuName
mds128_&cpuName:
if KEY_MODE and (KM_FULL or KM_PART or KM_COMPILE)
ld8 <b>,<byte ptr SboxKey[edi]>,cpuName
mov esi,pTab[4*edi]
call permXor_&cpuName ;compute final stage of 8-bit S-box
endif
sbox8Done_&cpuName:
if KEY_MODE and (KM_FULL or KM_COMPILE);---------;now expand from 8-bit to 32 via MDStab
mov tmp0,ebp
lea ebp,ks.fullSbox
mov tmp1,edi
mov eax,mdsOffsTab[4*edi]
shl edi,10
add ebp,eax ;ebp --> base of output table
xor eax,eax ;clear upper bits of eax
mov al,tmpSbox8[256-1]
xor ebx,ebx
mov bl,tmpSbox8[256-1-4]
add edi,offset MDStab ;base of this table in MDStab
mov esi,256-8
mds64Lp_&cpuName:
ifdif <cpuName>,<PentiumPro>
mov ecx,[ebp+8*esi] ;force cache line load for writes
mov edx,[ebp+8*esi+32] ;(no need for Pro)
endif
irp NN,<3,2,1,0>
mov ecx,[edi+4*eax] ;get the MDS table entry
mov edx,[edi+4*ebx]
if NN
ld8 <a>,tmpSbox8[esi+NN-1+4],cpuName ;load from different cache banks
ld8 <b>,tmpSbox8[esi+NN-1],cpuName
else
ld8 <a>,tmpSbox8[esi-1],cpuName ;get set for next time
ld8 <b>,tmpSbox8[esi-1-4],cpuName
endif
mov [ebp+8*esi+8*NN+8*4],ecx;store it in the Sbox under construction
mov [ebp+8*esi+8*NN],edx
endm
sub esi,8
jns mds64Lp_&cpuName
mov edi,tmp1 ;restore regs
mov ebp,tmp0
elseif KEY_MODE and (KM_PART or KM_MIN) ;-------;just copy the 8-bit Sbox
mov ecx,x8Tab[edi*4]
mov esi,256-64
lea ecx,ks.fullSbox[ecx]
pk_CopyLp_&cpuName:
ifdif <cpuName>,<PentiumPro> ;force load cache line before writing
mov eax,[ecx+esi]
mov ebx,[ecx+esi+36]
endif
irp NN,<7,6,5,4,3,2,1,0>
mov eax,tmpSbox[esi+NN*8]
mov ebx,tmpSbox[esi+NN*8+4]
mov [ecx+esi+NN*8],eax
mov [ecx+esi+NN*8+4],ebx
endm
sub esi,64
jae pk_CopyLp_&cpuName
endif ;elseif
mov ecx,kPtr ;get ready for next time
mov esi,pPtr
dec edi ;decrement byte number
jns SboxLp_&cpuName ;keep going until all bytes processed
; jmpRet cpuName ;;10000 to here (MMX:6410)(Pro: 7711) ["All"]
if KEY_MODE and KM_COMPILE ;copy the code over and "compile" it
cmp ks.keySig,VALID_SIG ;must be correct C model as well
jnz badKeyLen_&cpuName ;hang if not
cld
lea esi,cipherProcStart_&cpuName
lea ecx,cipherProcEnd_&cpuName+MOVS_MASK
mov edx,esi
lea edi,ks.cipherProcCode[MOVS_MASK]
and esi,NOT MOVS_MASK ;put on (MOVS_MASK+1) byte boundaries
and ecx,NOT MOVS_MASK ;(for fastest block move)
and edi,NOT MOVS_MASK ;keep on paragraph boundaries
sub ecx,esi
sub edx,esi ;edx=how much we moved down
add edx,edi ;edx=where cipherProcStart is in cipherProcCode
cmp ecx,ks.codeSize ;is there room?
ja badKeyLen_&cpuName ;hang if not
shr ecx,2 ;do it one dword at a time
sub edx,offset cipherProcStart_&cpuName ;edx=value to add to translate offsets
cmp ks.cSig1,CSIG_1 ;have we already compiled the code here?
iff z ;if so, don't re-copy the code
cmp ks.cSig2,CSIG_2 ;make sure the signatures match
ifbrk nz
lea eax,ks.cipherProcCode ;are proc ptrs in range?
cmp eax,ks.encryptFuncPtr ;if not, we need to compile
ifbrk a
cmp eax,ks.decryptFuncPtr
ifbrk a
add eax,ks.codeSize
cmp eax,ks.encryptFuncPtr
ifbrk b
cmp eax,ks.decryptFuncPtr
ifbrk b
jmp copyDone_&cpuName ;if so, don't copy again, just patch in subkeys
endi
mov ks.cSig1,CSIG_1
mov ks.cSig2,CSIG_2
ifdif <cpuName>,<PentiumPro>
cpyLp:
mov eax,[edi] ;load Pentium cache lines to speed things up
mov ebx,[edi+36]
irp QQ,<0,8,16,24,32,40,48,56>
mov eax,[esi+QQ] ;move 8 bytes at a time
mov ebx,[esi+QQ+4]
mov [edi+QQ],eax
mov [edi+QQ+4],ebx
endm
add esi,64
add edi,64
sub ecx,16
jae cpyLp
else
rep movsd ;copy over the code
endif
; now cipherProcStart is at edx! Patch in the subkey values as immediates
copyDone_&cpuName:
lea eax,TwoFishEncrypt_&cpuName[edx]
lea ebx,TwoFishDecrypt_&cpuName[edx]
mov ks.encryptFuncPtr,eax ;set up the function pointers
mov ks.decryptFuncPtr,ebx
if DO_COMPILE and DO_PATCH
mov eax,ks.subKeys[INPUT_WHITEN]
mov ebx,ks.subKeys[INPUT_WHITEN+4]
mov dword ptr Enc_CBC_SK_0_&cpuName[edx-4],eax
mov dword ptr Enc_CBC_SK_1_&cpuName[edx-4],ebx
mov dword ptr Dec_CBC_SK_0_&cpuName[edx-4],eax
mov dword ptr Dec_CBC_SK_1_&cpuName[edx-4],ebx
mov eax,ks.subKeys[INPUT_WHITEN+8]
mov ebx,ks.subKeys[INPUT_WHITEN+12]
mov dword ptr Enc_CBC_SK_2_&cpuName[edx-4],eax
mov dword ptr Enc_CBC_SK_3_&cpuName[edx-4],ebx
mov dword ptr Dec_CBC_SK_2_&cpuName[edx-4],eax
mov dword ptr Dec_CBC_SK_3_&cpuName[edx-4],ebx
mov ecx,TOTAL_SUBKEYS-1 ;patch once for every subkey
; patch in the subkey immediate values
skPatchLp_&cpuName:
mov eax,skPatchList_&cpuName[8*ecx]
mov ebx,skPatchList_&cpuName[8*ecx+4]
mov edi,ks.subKeys[4*ecx]
dec ecx
mov [eax+edx],edi
mov [ebx+edx],edi
jns skPatchLp_&cpuName
if (cpuName eq PentiumPro)
; patch in Sbox offsets (Pentium Pro only) -- "relocate" code
mov esi,offset sboxPatchList_&cpuName
lea ecx,S32_0 ;start address for Sbox
mov eax,[esi] ;ptr to A patch point
cmp ecx,[eax+edx] ;already relocated?
jz sboxPatchDone_&cpuName ;if so, don't need to re-do it
sboxPatchLp_&cpuName:
mov eax,[esi] ;ptr to A patch point
mov ebx,[esi+4] ;ptr to B patch point
add eax,edx
mov edi,[esi+8] ;round "bump"
add ebx,edx
rept MAX_ROUNDS-1
mov [eax],ecx ;patch A and B
add eax,edi
mov [ebx],ecx
add ebx,edi
endm
mov [eax],ecx ;do the final patch
mov [ebx],ecx
mov eax,[esi+12] ;adjust the sbox ptr
add esi,16 ;go to next table entry
add ecx,eax ;bump Sbox ptr
test eax,1 ;stop on odd bump value
jz sboxPatchLp_&cpuName
sboxPatchDone_&cpuName:
endif ;cpuName
endif ;DO_COMPILE
endif ;KM_COMPILE
endif ;!KM_ZERO
reKeyDone_&cpuName:
add esp,localSize2
popad
mov eax,1 ;success
ret
;
; permXor: pass every byte of the 256-byte table at _S_ through a byte
; lookup table and xor in a key byte, in place:
;     _S_[i] = table[_S_[i]] xor keyByte      for i = 0..255
; Called once per stage from the mds256/mds192/mds128 code earlier in this
; procedure.
;
; Input: tmpSbox permutation (at _S_, due to call return address)
; esi --> q0 or q1, permutation through which to pass tmpSbox
; ebx = key byte to xor into data after q0/q1
;       (the replication code below only yields four copies of the key
;        byte if the upper bits of ebx are zero on entry)
; Output: tmpSbox updated
; edi,ebp unmodified
; eax,ebx,ecx,edx and the flags are overwritten; esi is not written
; NOTE(review): _S_ is defined outside this routine; it is used here only
; after both pushes below, so it presumably already accounts for them and
; for the return address -- confirm against its definition.
permXor_&cpuName:
ifdif <cpuName>,<PentiumPro>
; ---- variant assembled for every cpuName other than PentiumPro ----
push edi ;save edi on stack so we don't modify it
mov bh,bl ;bx = key byte in both halves
push ebp
mov ebp,ebx
shl ebx,16
xor eax,eax ;upper bits of eax stay zero: al is used as a table index
mov edi,256-8 ;edi = offset of the 8-byte group being processed
mov al,_S_[256-1] ;preload the top byte of the upper dword of the first group
or ebp,ebx ;ebp=four bytes replicated from original ebx
xor ebx,ebx ;upper bits of ebx stay zero: bl is used as a table index
mov bl,_S_[256-1-4] ;preload the top byte of the lower dword of the first group
; Each pass handles the 8 bytes at offsets edi .. edi+7 of _S_:
;   ecx collects the looked-up bytes for offsets edi+4 .. edi+7 (index in eax)
;   edx collects the looked-up bytes for offsets edi   .. edi+3 (index in ebx)
; The label name is fixed (not built from cpuName); only one of the two
; ifdif arms is assembled in any one expansion.
pxLoop_Pentium:
; the irp list holds the single value 0, so the body is emitted once
irp NN,<0>
mov ch,[esi+eax]
mov al,_S_[edi+6+NN]
mov dh,[esi+ebx]
mov bl,_S_[edi+2+NN]
mov cl,[esi+eax]
mov al,_S_[edi+5+NN]
shl ecx,16 ;move the two high result bytes into place
mov dl,[esi+ebx]
shl edx,16
mov bl,_S_[edi+1+NN]
mov ch,[esi+eax]
mov al,_S_[edi+4+NN]
mov dh,[esi+ebx]
mov bl,_S_[edi+NN]
mov cl,[esi+eax]
mov al,_S_[edi-1+NN] ;preload for the next pass (one byte below this group)
mov dl,[esi+ebx]
mov bl,_S_[edi-5+NN] ;preload for the next pass
xor ecx,ebp ;xor the key byte into all four result bytes
xor edx,ebp
mov dword ptr _S_[edi+4+NN],ecx
mov dword ptr _S_[edi+NN],edx
endm
; On the last pass (edi = 0) the two preloads above read offsets -1 and -5,
; i.e. below the start of _S_; those values are never used.
sub edi,8
jae pxLoop_Pentium ;loop until the subtraction borrows (32 passes)
else
; ---- variant assembled when cpuName is PentiumPro ----
push edi
push ebp
mov ebp,ebx
mov edi,256-8 ;edi = offset of the 8-byte group being processed
shl ebp,8
or ebp,ebx ;low 16 bits of ebp = key byte twice
mov ebx,ebp
shl ebp,16
mov ecx,dword ptr _S_[edi+4];pick up next four bytes of permutation
or ebp,ebx ;ebp=four bytes replicated from original ebx
mov eax,ecx
movzx edx,ch ;edx = byte 1 of the dword just loaded
; Each pass handles two dwords, offset edi+4 first and then offset edi.
; On entry to each irp body: ecx = eax = dword to translate, edx = its byte 1.
; The label name is fixed (not built from cpuName), as in the other arm.
pxLoop_PentiumPro:
irp ZZ,<4,0>
movzx ecx,cl ;ecx = byte 0
shr eax,16 ;al = byte 2, ah = byte 3
movzx edx,byte ptr [esi+edx]
shl edx,8 ;looked-up byte 1 into bits 8..15
movzx ecx,byte ptr [esi+ecx]
movzx ebx,ah
movzx ebx,byte ptr [esi+ebx]
movzx eax,al
xor ecx,ebp ;looked-up byte 0 xor the replicated key bytes
shl ebx,24 ;looked-up byte 3 into bits 24..31
movzx eax,byte ptr [esi+eax]
xor edx,ecx
shl eax,16 ;looked-up byte 2 into bits 16..23
xor ebx,edx
mov ecx,dword ptr _S_[edi+ZZ-4] ;get ready for next loop
xor ebx,eax ;ebx = four looked-up bytes xor the replicated key bytes
mov eax,ecx
movzx edx,ch
mov dword ptr _S_[edi+ZZ],ebx
endm
; On the last pass (edi = 0) the final preload reads the dword at offset -4,
; i.e. below the start of _S_; that value is never used.
sub edi,8
jae pxLoop_PentiumPro ;loop until the subtraction borrows (32 passes)
endif
pop ebp ;restore the caller's ebp and edi
pop edi
ret
;
badKeyLen_&cpuName: ; will only come here if there is an invalid keyLen parameter
int 3
add esp,localSize2
popad
xor eax,eax
ret
if (KEY_MODE and KM_ZERO) eq 0
dd badKeyLen_&cpuName
mdsJmpTab_&cpuName dd badKeyLen_&cpuName
if BIG_TAB
dd sbox8Done_&cpuName
endif
dd mds128_&cpuName,mds192_&cpuName,mds256_&cpuName
endif
skJmpTab_&cpuName dd badKeyLen_&cpuName
dd sk128_&cpuName,sk192_&cpuName,sk256_&cpuName
reKeyCode_&cpuName = ($-reKey_Start_&cpuName)+reKeySharedSize
irp cby,<%(reKeyCode_&cpuName)>
%out --- ReKey assembly code size == cby bytes (cpuName)
endm
irp cby,<%(bigKeyCode_&cpuName)>
%out --- Big key assembly code size == cby bytes (cpuName)
endm
irp cby,<%(reKeyCode_&cpuName-bigKeyCode_&cpuName)>
%out --- 128 key assembly code size == cby bytes (cpuName)
endm
irp cby,<%($-asmStart_&cpuName+reKeySharedSize)>
%out --- TOTAL assembly code size == cby bytes (cpuName)
%out
endm
TwoFishReKey_&cpuName endp
endm ;reKeyProc
;
;__MACRO_DEF_DONE__ ;sign to LST2ASM that macro definitions are done
;
;
;-------------------- Pentium (and 486) optimized -------------
;
; Expand the cipher and re-key macros with cpuName = Pentium.
; asmStart_Pentium/asmEnd_Pentium bracket the expanded code so that its
; size can be taken as a label difference (see TwofishAsmCodeSize, and the
; size report at the end of reKeyProc, which references the asmStart_ label
; for its cpuName).
; NOTE(review): cipherProc is defined outside this view; it presumably
; emits TwoFishEncrypt_Pentium and TwoFishDecrypt_Pentium -- confirm.
asmStart_Pentium:
cipherProc Pentium
reKeyProc Pentium
asmEnd_Pentium:
;
;----------------- Pentium Pro/Pentium II optimized ------------
;
; Expand the cipher and re-key macros with cpuName = PentiumPro.
; asmStart_PentiumPro/asmEnd_PentiumPro bracket the expanded code so that
; its size can be taken as a label difference (see TwofishAsmCodeSize).
align 4 ;start this code set on a 4-byte boundary
asmStart_PentiumPro:
; db 9 dup (90h) ;optimize code alignment (empirical)
cipherProc PentiumPro
reKeyProc PentiumPro
asmEnd_PentiumPro:
;
;----------------- Select which routines to use ---------------
;
; useAsm: bit 0 --> enable encryption in ASM
; bit 1 --> enable decryption in ASM
; bit 2 --> enable reKey in ASM
; bit 3 --> command-line override of the CPU type:
; setProcPtrs first picks a code set from get_cpu_type
; (a result of 5 selects the Pentium code, any other
; result selects the PentiumPro code); if bit 3 of
; useAsm is set (TEST useAsm,8), that choice is
; inverted.
; setProcPtrs examines no other bit of useAsm and does not shift it.
;
; Symbols supplied by other modules
extrn get_cpu_type:near ;called by setProcPtrs, which reads the result from al
extrn useAsm:byte ;byte variable; setProcPtrs tests bit 3 (mask 8)
;
; Symbols exported from this module: the encrypt/decrypt/reKey entry points
; of both code sets, plus the code-size query routine defined below.
public TwoFishEncrypt_Pentium,TwoFishDecrypt_Pentium,TwoFishReKey_Pentium
public TwoFishEncrypt_PentiumPro,TwoFishDecrypt_PentiumPro,TwoFishReKey_PentiumPro
public TwofishAsmCodeSize
;-----------------------------------------------------------------------
; setProcPtrs -- select the Pentium or the PentiumPro routine set.
;
; Writes the addresses of the chosen TwoFishEncrypt/Decrypt/ReKey routines
; into blockEncrypt_86, blockDecrypt_86 and reKey_86.
;   * get_cpu_type result (read from al) == 5  --> Pentium set
;     any other result                         --> PentiumPro set
;   * if bit 3 of useAsm is set, the choice above is inverted
;
; In:    nothing
; Out:   the three pointer variables are updated
; Regs:  all general registers preserved (pushad/popad); flags changed
;-----------------------------------------------------------------------
setProcPtrs proc
        pushad

        ; start from the Pentium set; it stays in place unless the
        ; PentiumPro set is selected below
        mov     reKey_86,offset TwoFishReKey_Pentium
        mov     blockDecrypt_86,offset TwoFishDecrypt_Pentium
        mov     blockEncrypt_86,offset TwoFishEncrypt_Pentium

        call    get_cpu_type
        cmp     al,5
        setne   al                      ;al = 0 for type 5, 1 for anything else

        test    useAsm,8                ;command line override of CPU type?
        iff     nz
          xor   al,1                    ;yes: flip the selection
        endi

        test    al,al                   ;nonzero --> PentiumPro set
        iff     nz
          mov   reKey_86,offset TwoFishReKey_PentiumPro
          mov   blockDecrypt_86,offset TwoFishDecrypt_PentiumPro
          mov   blockEncrypt_86,offset TwoFishEncrypt_PentiumPro
        endi

        popad
        ret
setProcPtrs endp
; E_SelectCPU: run CPU selection, then continue in the selected encrypt
; routine. setProcPtrs stores the address of a Pentium or PentiumPro
; encrypt routine in blockEncrypt_86 and preserves all general registers
; (flags are changed); the indirect jmp then enters that routine with the
; stack exactly as this proc received it, so its ret goes back to this
; proc's caller.
; NOTE(review): presumably blockEncrypt_86 initially points here, so that
; only the first encrypt call takes this path -- confirm where
; blockEncrypt_86 is defined.
E_SelectCPU proc ;only come here once
call setProcPtrs
jmp blockEncrypt_86 ;use the ptr we just set up
E_SelectCPU endp
;
; D_SelectCPU: run CPU selection, then continue in the selected decrypt
; routine. setProcPtrs stores the address of a Pentium or PentiumPro
; decrypt routine in blockDecrypt_86 and preserves all general registers
; (flags are changed); the indirect jmp enters that routine with the stack
; exactly as this proc received it, so its ret goes back to this proc's
; caller.
D_SelectCPU proc
call setProcPtrs
jmp blockDecrypt_86 ;continue through the pointer just set up
D_SelectCPU endp
; R_SelectCPU: run CPU selection, then continue in the selected reKey
; routine. setProcPtrs stores the address of a Pentium or PentiumPro reKey
; routine in reKey_86 and preserves all general registers (flags are
; changed); the indirect jmp enters that routine with the stack exactly as
; this proc received it, so its ret goes back to this proc's caller.
R_SelectCPU proc
call setProcPtrs
jmp reKey_86 ;continue through the pointer just set up
R_SelectCPU endp
; code size functions
;-----------------------------------------------------------------------
; TwofishAsmCodeSize -- report the size of the selected code set.
;
; Runs setProcPtrs, then returns the byte span between the asmStart_ and
; asmEnd_ labels of whichever set blockEncrypt_86 now points into, plus
; reKeySharedSize.
;
; In:    nothing
; Out:   eax = (asmEnd_xxx - asmStart_xxx) + reKeySharedSize
; Regs:  only eax and the flags are changed (setProcPtrs preserves the
;        general registers)
;-----------------------------------------------------------------------
TwofishAsmCodeSize proc
        call    setProcPtrs             ;refresh blockEncrypt_86 first

        ; assume the PentiumPro set, fall back to the Pentium span otherwise
        mov     eax,offset asmEnd_PentiumPro-asmStart_PentiumPro
        cmp     blockEncrypt_86,offset TwoFishEncrypt_PentiumPro
        iff     nz
          mov   eax,offset asmEnd_Pentium-asmStart_Pentium
        endi

        add     eax,reKeySharedSize     ;add the shared re-key size
        ret
TwofishAsmCodeSize endp
end
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -