; File: 2fish_86.asm
; (web viewer residue: font size control)
endm
endif
TwoFishDecrypt_&cpuName endp
endm ;cipherProc
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;-------------------------- Key schedule -----------------------
;
; Stack-frame layout used by the key-schedule code below.
; NOTE(review): the alloc macro is defined outside this view; it presumably
; assigns each name an offset starting at varOffs and advances varOffs by the
; given size -- confirm against its definition.
varOffs = 0
alloc kLen64,dword,4 ;keyLen in bits/64
alloc reKeyJmpPtr,dword,4 ;key-size dependent function ptr
alloc SboxKey,dword,MAX_KEY_BITS/16 ;Sbox keys (RSrem output for each pair of key dwords; stored by rsLoop in reKeyProc)
; tmp0..tmp3: key bytes saved by skLdKey and reloaded inside subKey8x32Proc
alloc tmp0,dword,4
alloc tmp1,dword,4
alloc tmp2,dword,4
alloc tmp3,dword,4
alloc localPool,dword,4*SUBKEY_SIZE+(MAX_KEY_BITS/8)+4
; localSize2 is the amount reKeyProc subtracts from esp after its pushad
localSize2 = varOffs
alloc _regs,dword,8*4 ;pushad puts regs here
alloc _retAddr,dword,4 ;esp --> here on entry
alloc keyPtr,dword,4 ;pointer to key schedule to initialize
;
; localPool usage during subkey generation
lKey32 equ <localPool> ;local copy of the key bits
lSubkey equ <lKey32[MAX_KEY_BITS/8]>;local copy of subkey material (starts right after the MAX_KEY_BITS/8 bytes of lKey32)
; localPool usage during S-box generation
; (these aliases overlay the same localPool storage as lKey32/lSubkey above)
pPtr equ <localPool>
kPtr equ <localPool+4>
tmpSbox equ <localPool+8>
tmpSbox8 equ <byte ptr tmpSbox> ;byte-sized view of tmpSbox
;
;----------------------------------------------------------------
; RSrem
; Input: edx = dword to be shifted 4 times
; Output: edx = dword shifted
; ecx,edi unmodified (ecx is saved and restored on the stack)
; Clobbers: eax, ebx, esi and the flags
; Note: Speed is not a very significant issue here.
; If it were, an 8x32 feedback table could be built.
;
; g(x) = x^4 + A4 x^3 + 02 x^2 + A4 x + 1 ;primitive polynomial = 14D
;
; Each of the four passes rotates the top byte b of edx into the low byte
; and then folds feedback terms into the other three bytes:
;   g2 = ((b shl 1) and 0FFh) xor (4Dh when bit 7 of b is set)
;   g3 = (b shr 1) xor (0A6h when bit 0 of b is set) xor g2
;   edx = (edx rol 8) xor (g3 shl 24) xor (g2 shl 16) xor (g3 shl 8)
; (4Dh is the low byte of 14Dh and 0A6h is 14Dh shr 1.)
;
; This RSrem code runs equally well on a Pentium or Pentium Pro. It is a little
; larger than Pentium-specific code (40 bytes), but not enough to worry about.
;
RSrem proc
push ecx ;do not modify ecx here
mov ecx,4 ;loop counter: four passes
remLoop:
rol edx,8 ;put new byte into place (old top byte b becomes the low byte)
mov eax,0FEH
mov esi,1 ;get ready to mask from new byte
mov ebx,0FFH
and eax,edx ;eax = b with bit 0 cleared
and esi,edx ;esi=bit 0 of feedback byte
and ebx,edx ;ignore all but the important bits (ebx = b)
shl eax,23 ;eax = (b shr 1) positioned in byte 3
xor esi,1 ;invert bit 0 so that the dec below produces the mask
xor ebx,80h ;together with the sub below this sign-extends b
xor edx,eax ;byte 3 of edx ^= (b shr 1)
dec esi ;make esi = all zeroes or all ones (all ones when bit 0 of b was set)
sub ebx,80h ;upper 24 bits of ebx are now copies of bit 7 of b
shr eax,16 ;eax = (b shr 1) positioned in byte 1
and esi,0A600A600H ;mask the position (0A6h in bytes 3 and 1 when bit 0 was set)
add ebx,ebx ;shift ebx into position (low byte = b shl 1, bits 8..31 = bit 7 of b)
xor edx,eax ;byte 1 of edx ^= (b shr 1)
xor edx,esi ;bytes 3 and 1 ^= the conditional 0A6h
mov esi,ebx
shl esi,24 ;esi = low byte of (b shl 1) positioned in byte 3
and ebx,4D4D4D00H ;ebx = 4Dh in bytes 3,2,1 when bit 7 of b was set, else zero
xor edx,ebx ;apply the conditional 4Dh to bytes 3,2,1
mov ebx,esi
shr esi,8 ;shifted byte now positioned in byte 2
xor edx,ebx ;byte 3 ^= low byte of (b shl 1)
shr ebx,16 ;shifted byte now positioned in byte 1
xor edx,esi ;byte 2 ^= low byte of (b shl 1)
xor edx,ebx ;byte 1 ^= low byte of (b shl 1)
dec ecx
jnz remLoop ;repeat until all four bytes have been fed back
pop ecx ;restore the caller's ecx
ret
RSrem endp
; Constant tables that follow RSrem in the same segment.
; NOTE(review): the code that consumes mdsOffsTab, pPtrTab and x8Tab is not
; visible in this part of the file -- check the users before changing layout.
mdsOffsTab dd 0,4,2*1024,2*1024+4 ;four dword offsets
; pTab: four rows of four dwords; each entry is the address of a P_xy table
; (the P_xy symbols are defined elsewhere). Columns run LSB..MSB.
; LSB MSB
pTab dd P_01,P_11,P_21,P_31 ;"last" stage (previous to MDSmat)
dd P_02,P_12,P_22,P_32
dd P_03,P_13,P_23,P_33
dd P_04,P_14,P_24,P_34
; pPtrTab: addresses of the four 16-byte rows of pTab
pPtrTab dd pTab,pTab+16,pTab+32,pTab+48
; x8Tab is only assembled when KEY_MODE has KM_PART or KM_MIN set
if KEY_MODE AND (KM_PART or KM_MIN)
x8Tab dd 0,100h,200h,300h
endif
;
; number of bytes from the start of RSrem up to this point (code plus tables)
reKeySharedSize = $-RSrem
; ldCache addr,byteCnt,cpuName
; Emits dummy dword reads over the byteCnt bytes at addr. Nothing is emitted
; when cpuName is PentiumPro. For every 64-byte step NN the macro reads
; addr[NN] into eax and, when byteCnt extends past NN+32, addr[NN+36] into ebx.
; The loaded values are not used, so the only visible effect is that eax and
; ebx are overwritten.
; NOTE(review): the one-element irp lists appear to exist only to turn the
; value of NN (or NN+36) into literal text for the operand -- confirm.
ldCache macro addr,byteCnt,cpuName
ifdif <cpuName>,<PentiumPro>
NN=0
rept (byteCnt+63)/64 ;force cache line load (Pentium only)
irp QQ,<%(NN)>
mov eax,addr[QQ]
endm
if (NN+32) lt byteCnt
irp QQ,<%(NN+36)>
mov ebx,addr[QQ]
endm
endif
NN=NN+64
endm
endif
endm
; pXor8 dst,src,reg,N
; Copies the two dwords at src+N and src+N+4 to dst+N and dst+N+4, xoring each
; with reg on the way. Both loads are issued before either store.
; Overwrites eax and ebx, so reg must be neither of them: ebx would be
; reloaded before it is used, and eax would be xored with itself.
pXor8 macro dst,src,reg,N
mov eax,src+N
mov ebx,src+N+4
xor eax,reg
xor ebx,reg
mov dst+N,eax
mov dst+N+4,ebx
endm
; ld8 dstReg,src,cpuName
; Loads one byte from src into the register family named by the single letter
; dstReg (a, b, c or d).
;   PentiumPro : movzx into the full 32-bit register (e?x), upper bits cleared
;   otherwise  : mov into the low 8-bit register (?l), upper 24 bits untouched
ld8 macro dstReg,src,cpuName ;;__TRANSPARENT__
  ifidn <cpuName>,<PentiumPro>
        movzx e&dstReg&x,byte ptr src
  else
        mov dstReg&l,byte ptr src
  endif
endm
; skLdKey keySize,N,cpuName
; Fetches the key bytes lKey32[N], lKey32[N+8], ... that subKey8x32Proc needs.
; Results, identical for both CPU variants:
;   keySize = 128 : ebp = lKey32[N]     esi = lKey32[N+8]
;   keySize = 192 : tmp1 = lKey32[N]    ebp = lKey32[N+8]
;                   tmp3 = esi = lKey32[N+16]
;   otherwise     : tmp0 = lKey32[N]    tmp1 = lKey32[N+8]
;                   tmp2 = ebp = lKey32[N+16]
;                   tmp3 = esi = lKey32[N+24]
; The PentiumPro variant zero-extends with movzx and touches only esi/ebp.
; The other variant loads through cl/dl and copies all of ecx/edx, so it
; overwrites ecx and edx and depends on their upper 24 bits being zero on entry.
skLdKey macro keySize,N,cpuName ;;__TRANSPARENT__
  ifidn <cpuName>,<PentiumPro>
    if keySize eq 128
        movzx ebp,byte ptr lKey32[N] ;load esi/ebp with key bytes
        movzx esi,byte ptr lKey32[N+8]
    elseif keySize eq 192
        movzx esi,byte ptr lKey32[N]
        movzx ebp,byte ptr lKey32[N+8]
        mov tmp1,esi
        movzx esi,byte ptr lKey32[N+16]
        mov tmp3,esi
    else ;keySize eq 256
        movzx ebp,byte ptr lKey32[N] ;load esi/ebp with key bytes
        movzx esi,byte ptr lKey32[N+8]
        mov tmp0,ebp
        mov tmp1,esi
        movzx ebp,byte ptr lKey32[N+16]
        movzx esi,byte ptr lKey32[N+24]
        mov tmp2,ebp
        mov tmp3,esi
    endif
  else
        mov cl,byte ptr lKey32[N]
        mov dl,byte ptr lKey32[N+8]
    if keySize eq 128
        mov ebp,ecx
        mov esi,edx ;load esi/ebp with key bytes
    elseif keySize eq 192
        mov tmp1,ecx ;save these
        mov cl,byte ptr lKey32[N+16]
        mov ebp,edx
        mov tmp3,ecx
        mov esi,ecx
    else ;keySize eq 256
        mov tmp0,ecx ;save copy on stack
        mov cl,byte ptr lKey32[N+16]
        mov tmp1,edx
        mov dl,byte ptr lKey32[N+24]
        mov tmp2,ecx
        mov ebp,ecx
        mov tmp3,edx
        mov esi,edx
    endif
  endif
endm
;
;
; Per-CPU build switch: 0 instantiates the subkey code inline at each use,
; nonzero builds it as callable routines (see the if DO_CALL_ tests below).
DO_CALL_Pentium = 0 ;inline code, or use calls?
DO_CALL_PentiumPro = 0
; When calls are enabled, declare public alignment labels for each CPU.
; NOTE(review): this declares ASM_ALIGN_0_cpu .. ASM_ALIGN_3_cpu, but the labels
; defined in reKeyProc also carry the key size (ASM_ALIGN_256_0_cpu and so on),
; so these names look stale -- verify before enabling either DO_CALL switch.
irp cpuName,<Pentium,PentiumPro>
if DO_CALL_&cpuName
irp XX,<0,1,2,3>
concat <public ASM_ALIGN_>,%(XX),<_>,cpuName
endm
endif
endm
; jmpRet cpuName
; Timing/debug aid: jumps straight to reKeyDone_cpuName. Every invocation
; visible in reKeyProc is commented out; they mark measurement points.
jmpRet macro cpuName
jmp reKeyDone_&cpuName ;comment this line to do full testing
endm
;
; precompute first step in 32-bit chunks?? Nope -- only saves ~ 40 clocks
;
; subKey8x32Proc keySize,N,cpuName,s0,s1[,s2[,s3]]
;
; Emits the loop labelled skProc_keySize_N_cpuName. Each pass handles four
; subkey indices (edi, edi+2, edi+4, edi+6): every index byte is run through
; the 8x8 tables s0, s1 (and s2, s3 when supplied), xoring in one key byte
; after each table, and is finally looked up in MDStab[...+N*1024]. The four
; dword results are stored at lSubkey[4*edi+N*SUBKEY_SIZE + 0/8/16/24].
; edi is then decremented by 8 and the loop repeats until the subtraction
; borrows, so edi is negative on exit.
;
; Input: edi = i
; esi = byte to xor with s0[] output
; ebp = byte to xor with s1[] output
; tmp0..tmp3 = further key bytes as stored by skLdKey (3- and 4-table forms)
; eax,ebx,edx = upper 24 bits zero (the non-PentiumPro ld8 writes only
; al/bl/cl/dl; ecx is cleared at the top of every pass)
; Output: eax = 0, ebx/ecx/edx = last table indices, esi/ebp reloaded from
; tmp3/tmp2 in the 3- and 4-table forms, flags modified
;
; When DO_CALL_cpuName is nonzero the loop is built as a callable routine:
; stack references are biased by 4 for the return address and the code is
; padded with zero bytes after the ret. That padding expression used to name
; skProc_N_cpuName, a label that is never defined; it now names the real
; loop label skProc_keySize_N_cpuName.
;
subKey8x32Proc macro keySize,N,cpuName,s0,s1,s2,s3
  if DO_CALL_&cpuName
        align 4
LSK_OFFS = 4
  else
LSK_OFFS = 0
  endif
skProc_&keySize&_&N&_&cpuName:
        xor ecx,ecx ;keep Pentium pairing properly aligned
        ld8 <a>,s0[edi],cpuName ;run through first 8x8 permutation
        xor eax,esi ;xor with first key byte
        ld8 <b>,s0[edi+2],cpuName
        xor ebx,esi
        ld8 <c>,s0[edi+4],cpuName
        xor ecx,esi
        ld8 <d>,s0[edi+6],cpuName
        xor edx,esi
        ld8 <a>,s1[eax],cpuName ;run through second 8x8
        xor eax,ebp ;xor with second key byte
        ld8 <b>,s1[ebx],cpuName
        xor ebx,ebp
        ld8 <c>,s1[ecx],cpuName
        xor ecx,ebp
        ld8 <d>,s1[edx],cpuName
  ifnb <s2>
        xor edx,ebp
        ld8 <a>,s2[eax],cpuName ;run through 3rd 8x8
        mov esi,tmp1+4*DO_CALL_&cpuName
    if keySize gt 192
        mov ebp,tmp0+4*DO_CALL_&cpuName
    endif
        xor eax,esi ;xor with 3rd key byte
        ld8 <b>,s2[ebx],cpuName
        xor ebx,esi
        ld8 <c>,s2[ecx],cpuName
        xor ecx,esi
        ld8 <d>,s2[edx],cpuName
        xor edx,esi
    ifnb <s3>
        ld8 <a>,s3[eax],cpuName ;run through 4th 8x8
        xor eax,ebp ;xor with 4th key byte
        ld8 <b>,s3[ebx],cpuName
        xor ebx,ebp
        ld8 <c>,s3[ecx],cpuName
        xor ecx,ebp
        ld8 <d>,s3[edx],cpuName
        xor edx,ebp
    endif
        mov eax,MDStab[4*eax+N*1024];do final 8x8 and MDS multiply lookup
        mov esi,tmp3+4*DO_CALL_&cpuName ;reload old key bytes
    if keySize gt 192
        mov ebp,tmp2+4*DO_CALL_&cpuName ;(for next time)
    endif
  else
        xor edx,ebp
        mov eax,MDStab[4*eax+N*1024];do final 8x8 and MDS multiply lookup
  endif
        ;store the four MDS results; LSK_OFFS skips the return address in call mode
        concat < mov lSubkey[4*edi+N*SUBKEY_SIZE][>,%LSK_OFFS,<],eax>
        mov eax,MDStab[4*ebx+N*1024]
        concat < mov lSubkey[4*edi+N*SUBKEY_SIZE+8][>,%LSK_OFFS,<],eax>
        mov eax,MDStab[4*ecx+N*1024]
        concat < mov lSubkey[4*edi+N*SUBKEY_SIZE+16][>,%LSK_OFFS,<],eax>
        mov eax,MDStab[4*edx+N*1024]
        concat < mov lSubkey[4*edi+N*SUBKEY_SIZE+24][>,%LSK_OFFS,<],eax>
        xor eax,eax ;clear upper bits of eax for the next byte load
        sub edi,8 ;step back four subkeys of the same parity
        jae skProc_&keySize&_&N&_&cpuName ;loop until the subtraction borrows
  if DO_CALL_&cpuName
        ret 0
        ;zero padding measured from the loop label defined above
        db ((1 + (15-(($-skProc_&keySize&_&N&_&cpuName) AND 15))) AND 15) dup (0)
  endif
endm
; skProc keySize,N,cpuName
; Instantiates subKey8x32Proc for S-box column N with one 8x8 table stage per
; 64 bits of key: four stages for 256, three for 192, two for 128. The tables
; are passed highest stage first (P_N4 .. P_N1). Any other keySize assembles
; the undefined statement below, which stops the build.
skProc macro keySize,N,cpuName ;;__TRANSPARENT__
  if keySize eq 256
        subKey8x32Proc keySize,N,cpuName,P_&N&4,P_&N&3,P_&N&2,P_&N&1
  elseif keySize eq 192
        subKey8x32Proc keySize,N,cpuName,P_&N&3,P_&N&2,P_&N&1
  elseif keySize eq 128
        subKey8x32Proc keySize,N,cpuName,P_&N&2,P_&N&1
  else
        Invalid_KeySize_Error;;
  endif
endm
; skEvenOdd keySize,N,cpuName
; Runs the skProc loop for column N twice: once over the even subkey indices
; and once over the odd ones, both counting downwards.
;   - For N = 0, edi is initialised to TOTAL_SUBKEYS-8. For later columns edi
;     is rebased by adding TOTAL_SUBKEYS-1 to the value left by the previous
;     column.
;   - After the even pass, TOTAL_SUBKEYS+1 is added to edi and the key bytes
;     for N+4 are loaded before the odd pass.
; In inline mode the loop body is instantiated once and re-entered by a jmp;
; bit 0 of edi tells the two passes apart. In call mode the already
; instantiated routine is simply called twice.
skEvenOdd macro keySize,N,cpuName
  if N ne 0
        add edi,TOTAL_SUBKEYS-1
  else
        mov edi,TOTAL_SUBKEYS-8 ;do the even ones first (backwards)
  endif
        skLdKey keySize,N,cpuName
  ife DO_CALL_&cpuName
        skProc keySize,N,cpuName ;instantiate the code inline
        test edi,1 ;and loop twice for even/odd subkeys
        jnz short sk&keySize&_&N&_done_&cpuName
        add edi,TOTAL_SUBKEYS+1 ;go back for the odd ones
        skLdKey keySize,N+4,cpuName
        jmp skProc_&keySize&_&N&_&cpuName
        align 4
sk&keySize&_&N&_done_&cpuName:
  else
        call skProc_&keySize&_&N&_&cpuName
        add edi,TOTAL_SUBKEYS+1 ;go back for the odd ones
        skLdKey keySize,N+4,cpuName
        call skProc_&keySize&_&N&_&cpuName
  endif
endm
;
; _S_ is a byte-sized reference 12 bytes past the start of tmpSbox
; (tmpSbox8 is the byte ptr alias of tmpSbox).
; NOTE(review): the reason for the 12-byte bias is not visible here -- confirm.
_S_ equ <tmpSbox8[12]> ;use this to reference tmpSbox
;
; int reKey(keyInstance *keyPtr); // build the key schedule
;
reKeyProc macro cpuName
reKey_Start_&cpuName:
if DO_CALL_&cpuName
db 1 dup (0) ;align
irp kSize,<256,192,128>
ASM_ALIGN_&kSize&_0_&cpuName: skProc %kSize,0,cpuName
ASM_ALIGN_&kSize&_1_&cpuName: skProc %kSize,1,cpuName
ASM_ALIGN_&kSize&_2_&cpuName: skProc %kSize,2,cpuName
ASM_ALIGN_&kSize&_3_&cpuName: skProc %kSize,3,cpuName
endm
bigKeyCode_&cpuName = (ASM_ALIGN_128_0_&cpuName-ASM_ALIGN_256_0_&cpuName)
endif
;
TwoFishReKey_&cpuName proc
pushad
mov ebp,keyPtr-localSize2
sub esp,localSize2
xor edi,edi
biasEBP
mov ecx,ks.keyLen ;should be 128, 192, or 256
shr ecx,6 ;divide by 64
ldCache [esp],localSize2,cpuName
ldCache [ks.key32],%(fullSbox-key32),cpuName
; jmpRet cpuName ;; ??? to here (MMX: 160)(Pro: 160) ["Nothing"]
;copy over key material, do the Reed-Solomon thing
mov kLen64,ecx ;store key length
rsLoop_&cpuName:
mov edx,ks.key32[8*edi+4]
mov lKey32[8*edi+4],edx ;build a local copy of key bytes
call RSrem ;process the first four bytes
mov eax,ks.key32[8*edi] ;get next two key material dwords
mov lKey32[8*edi],eax ;(local copy)
inc edi ;bump ptr
xor edx,eax
call RSrem ;now process the final four
mov SboxKey[4*ecx-4],edx ;store result in reverse order
mov ks.sboxKeys[4*ecx-4],edx
if KEY_MODE and KM_ZERO
ifidn <cpuName>,<PentiumPro>
lea esi,[4*ecx-4]
movzx eax,dl ;expand Pentium Pro keys: dword --> four bytes
movzx ebx,dh
mov sboxK8[4*esi],eax
shr edx,16
mov sboxK8[4*esi+4],ebx
movzx eax,dh
mov sboxK8[4*esi+12],eax
movzx edx,dl
mov sboxK8[4*esi+8],edx
endif
endif
dec ecx
jg rsLoop_&cpuName
mov esi,skJmpTab_&cpuName[4*edi-4]
; compute all the subkey s-box results
xor edi,edi ;edi = i in subKey loop
xor eax,eax
xor ebx,ebx
xor ecx,ecx
xor edx,edx
; jmpRet cpuName ;; 487 to here (MMX: 451)(Pro: 315) ["RSrem"]
jmp esi
irp kBits,<256,192,128>
concat <sk>,%(kBits),<_>,cpuName:
skEvenOdd %(kBits),0,cpuName
skEvenOdd %(kBits),1,cpuName
skEvenOdd %(kBits),2,cpuName
skEvenOdd %(kBits),3,cpuName
if (kBits ne 128)
jmp sk8Done_&cpuName ;put 128-bit key last to fall through!
endif
endm
ife DO_CALL_&cpuName ;show how big the 192/256 bit code is
bigKeyCode_&cpuName = (sk128_&cpuName-sk256_&cpuName)
endif
sk8Done_&cpuName:
; jmpRet cpuName ;; 1850 to here (MMX:1100)(Pro: 959) ["4skEvenOdd"]
add edi,TOTAL_SUBKEYS-1+6 ;build in reverse order
mov ebp,keyPtr ;put final stuff here
biasEBP
; now run subkey s-box bytes through MDS and rotate/PHT/combine
subkeyLp_&cpuName:
mov eax,lSubkey[4*edi]
mov ebx,lSubkey[4*edi+4]
mov ecx,lSubkey[4*edi+SUBKEY_SIZE]
mov edx,lSubkey[4*edi+SUBKEY_SIZE+4]
xor eax,ecx
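; --- web viewer residue (keyboard shortcut help), not part of the source ---
; copy code: Ctrl + C
; search code: Ctrl + F
; full screen: F11
; toggle theme: Ctrl + Shift + D
; show shortcuts: ?
; increase font size: Ctrl + =
; decrease font size: Ctrl + -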