; File: 2fish_86.asm
page 60,160
title Twofish for 386+, Author: Doug Whiting, Hi/fn
;
; WARNING: This module is written for speed, not clarity!
;
; Assembled as 32-bit (.386) code for the flat memory model, with C as the
; default language type (see the .MODEL directive below).
;
.386
.MODEL FLAT, C
; The three public symbols are dword variables in .DATA that hold code
; addresses; they are not procedure labels.
public blockEncrypt_86,blockDecrypt_86,reKey_86
.DATA
; Each slot starts out pointing at a *_SelectCPU routine (defined outside this
; part of the file).
; NOTE(review): presumably those routines detect the CPU and then re-point the
; slot at a CPU-specific implementation -- confirm against the SelectCPU code.
blockEncrypt_86 dd E_SelectCPU ;first time thru, use get_cpu_type
blockDecrypt_86 dd D_SelectCPU
reKey_86 dd R_SelectCPU
extrn MDStab:dword ;MDS multiply matrix (pre-permuted)
extrn P8x8:byte ;two fixed 8x8 permutations
q0 equ P8x8[0] ;first permutation: bytes 0..255 of P8x8
q1 equ P8x8[256] ;second permutation: bytes 256..511 of P8x8
MOVS_MASK equ 15 ;optimized Pentium Pro movsd alignment
; Numeric CPU identifiers; macros compare their cpuName argument against these.
Pentium equ 586 ;defines for cpuName
PentiumPro equ 686
DO_COMPILE equ 1 ;for debugging performance
DO_PATCH equ 1
.CODE
;
; ML syntax:
; ml.exe -Flt2fish_86.lst -coff -Cx -Zi -Zm -c -DMASM6 2fish_86.asm
;----------------------------------------------------------------------------------
; Useful general macros
;----------------------------------------------------------------------------------
; NOTE(review): OFFSET32 is presumably read by strucmac.inc, which is why it is
; defined ahead of the include -- confirm against that file.
OFFSET32 equ 1 ;force 32-bit strucmac stuff
.xlist ;don't expand strucmac defns
include strucmac.inc
.list ;turn listing output back on after the include
;
; Key-schedule mode selection.
; KEY_MODE receives exactly one KM_* bit. The first of PART_KEY, MIN_KEY,
; ZERO_KEY, COMPILE_KEY that is defined at assembly time wins; with none
; defined the default is full keying. Every branch echoes its choice with
; %out so the build output always shows which variant was assembled.
;
BIG_TAB = 0
KM_ZERO = 1 ;KEY_MODE bits
KM_MIN = 2
KM_PART = 4
KM_FULL = 8
KM_COMPILE = 16
ifdef PART_KEY
%out Assembly with PART_KEY
KEY_MODE = KM_PART
elseifdef MIN_KEY
%out Assembly with MIN_KEY
KEY_MODE = KM_MIN
BIG_TAB = 0 ;can't use big table in min key mode
elseifdef ZERO_KEY
%out Assembly with ZERO_KEY
KEY_MODE = KM_ZERO
sboxK8 equ <ks.fullSbox> ;use S-box to expand keys for PentiumPro
elseifdef COMPILE_KEY
%out Assembly with COMPILE_KEY
KEY_MODE = KM_COMPILE
else ;default is full key
%out Assembly with FULL_KEY
KEY_MODE = KM_FULL
endif
;
; where the permutations are used
;
; P_ij selects which fixed permutation (q0 or q1) is applied to byte
; position i (0..3) at stage j (0..4). The lookupQ macro builds the name by
; splicing its i and j arguments onto "P_", and doS4 passes its stage number
; W as j. Only the q0/q1 assignment is defined here; no code is generated.
;
P_00 equ q1 ;"outermost" permutation (in MDSmat)
P_01 equ q0
P_02 equ q0
P_03 equ q1
P_04 equ q1
P_10 equ q0
P_11 equ q0
P_12 equ q1
P_13 equ q1
P_14 equ q0
P_20 equ q1
P_21 equ q1
P_22 equ q0
P_23 equ q0
P_24 equ q0
P_30 equ q0
P_31 equ q1
P_32 equ q1
P_33 equ q0
P_34 equ q1
;
; useful in "splicing" names together
; concat pastes up to eleven arguments together into one source line; blank
; arguments contribute nothing. Callers in this file use it to emit generated
; label definitions (for example "name label dword") and round labels.
concat macro aa,bb,cc,dd,ee,ff,gg,hh,ii,jj,kk ;;__TRANSPARENT__ (signal to LST2ASM)
aa&bb&cc&dd&ee&ff&gg&hh&ii&jj&kk
endm
; alloc: give a parameter or local a name that addresses it relative to esp.
;   symName  - identifier to define
;   symType  - operand type written in front of "ptr"
;   symBytes - amount added to the running offset varOffs afterwards
; symName becomes the text "symType ptr [esp+N]", where N is the value
; varOffs holds at the moment of the call. varOffs must already be assigned
; before the first use of this macro.
alloc macro symName,symType,symBytes
irp curOffs,<%varOffs>
symName equ symType ptr [esp+curOffs]
endm
varOffs = varOffs+symBytes
endm
; Copy8: store two dwords (8 bytes) at dest by way of eax and ebx.
;   dest   - destination operand; written at dest and dest+4
;   source - optional source operand, read at source and source+4. When it
;            is blank, whatever eax and ebx already hold is stored.
; Overwrites eax and ebx whenever source is supplied.
Copy8 macro dest,source
ifnb <source>
mov eax,source ;fetch both source dwords first
mov ebx,source+4
endif
mov dest,eax ;then write them out
mov dest+4,ebx
endm
;
;dst = xorA ^ xorB ^ xorC (8 bytes, xorC may be blank)
;where xorB is the pair of subkey dwords selected by subKeyIndex.
; lbl,cpuName - spliced into the _SK_ label names in compiled-key mode;
;               cpuName also picks the instruction form used for xorC/oldDst
; reg0,reg1   - registers that hold the two running dwords
; dst         - optional; where to store the result (nothing is stored if blank)
; xorA        - optional; loaded into reg0,reg1 first (blank = already loaded)
; subKeyIndex - byte offset of the subkey pair within ks.subKeys
; xorC        - optional third operand
;Returns two dst dwords in reg0,reg1.
;Trashes ecx,edx; the compiled-key, non-Pentium oldDst path trashes ebp instead.
;oldDst = where to copy current dst value before overwriting (for CBC mode)
Xor8 macro lbl,cpuName,reg0,reg1,dst,xorA,subKeyIndex,xorC,oldDst
ifnb <xorA>
mov reg0,xorA ;load both A dwords (unless already loaded)
mov reg1,xorA+4
endif
if (KEY_MODE and KM_COMPILE) eq 0
mov ecx,ks.subKeys[subKeyIndex] ;load both B dwords
mov edx,ks.subKeys[subKeyIndex+4]
xor reg0,ecx ;compute A ^ B
xor reg1,edx
else
; Compiled-key mode: the subkey is an immediate placeholder, and a label is
; defined directly after each xor instruction.
; NOTE(review): presumably key setup patches the dword just before each label
; with the real subkey; confirm against the reKey code.
xor reg0,12345678h ;do an xor with immediate values
concat lbl,_SK_,%((subKeyIndex)/4),<_>,cpuName,< label dword>
xor reg1,12345678h
concat lbl,_SK_,%((subKeyIndex)/4+1),<_>,cpuName,< label dword>
endif
ifnb <xorC>
if cpuName eq Pentium
mov ecx,xorC ;load both C dwords
mov edx,xorC+4
xor reg0,ecx ;compute A ^ B ^ C
xor reg1,edx
else
xor reg0,xorC
xor reg1,xorC+4
endif
endif
ifnb <dst>
ifnb <oldDst>
if (cpuName ne Pentium) and (KEY_MODE and KM_COMPILE)
; this path copies through ebp, leaving ecx and edx untouched
mov ebp,dst
mov oldDst,ebp
mov ebp,dst+4
mov oldDst+4,ebp
else
mov ecx,dst ;pick up previous ciphertext value
mov edx,dst+4
mov oldDst,ecx ;and make copy (for next IV)
mov oldDst+4,edx
endif
endif
mov dst,reg0 ;save the result
mov dst+4,reg1 ;(result also stays in reg0,reg1)
endif
endm
;
;----------------------------------------------------------------------------------
; Definitions, Structures (AES.H)
;----------------------------------------------------------------------------------
;
; BLOCK_SIZE and MAX_KEY_BITS are in bits. INPUT_WHITEN, OUTPUT_WHITEN and
; ROUND_SUBKEYS work out to 0, 16 and 32, i.e. byte offsets into the subkey
; array. TOTAL_SUBKEYS is a count of dwords: 32/4 + 2*16 = 40.
BLOCK_SIZE equ 128
MAX_KEY_BITS equ 256
MAX_ROUNDS equ 16
INPUT_WHITEN equ 0
OUTPUT_WHITEN equ (BLOCK_SIZE/8)
ROUND_SUBKEYS equ (OUTPUT_WHITEN+BLOCK_SIZE/8)
TOTAL_SUBKEYS equ ((ROUND_SUBKEYS/4)+2*MAX_ROUNDS)
SUBKEY_SIZE equ <4*TOTAL_SUBKEYS>
DIR_ENCRYPT equ 0
DIR_DECRYPT equ 1
MODE_ECB equ 1
MODE_CBC equ 2
MODE_CFB1 equ 3
; VALID_SIG is a four-character tag stored as a little-endian dword. Compiled
; key builds use a different tag plus two values derived from it.
if KEY_MODE and KM_COMPILE
VALID_SIG equ 504D4F43h ;'COMP'
CSIG_1 = VALID_SIG + 12345678H
CSIG_2 = VALID_SIG XOR -1
else
VALID_SIG equ 48534946h ;'FISH'
endif
; Cipher state record: mode, the two IV representations and a signature.
; NOTE(review): the section header names AES.H, so this layout presumably has
; to match the C cipherInstance struct field for field -- confirm.
cipherInstance struc
mode db 4 dup (?) ;MODE_ECB, MODE_CBC (Let C code handle MODE_CFB1)
CFB1_IV db BLOCK_SIZE/8 dup (?);CFB1 IV bytes
cipherSig dd ? ;should be VALID_SIG
IV32 dd BLOCK_SIZE/32 dup (?);CBC IV dwords
cipherInstance ends
; Key record: raw key material, the expanded key schedule and, in compiled-key
; builds only, the per-key generated code and its bookkeeping fields.
; NOTE(review): the section header names AES.H, so this layout presumably has
; to match the C keyInstance struct field for field -- confirm.
keyInstance struc
direction db 4 dup (?) ;DIR_ENCRYPT or DIR_DECRYPT
keyLen dd ? ;length of the key in bits
keyMaterial db 68 dup (?) ;ASCII key material
keySig dd ? ;should be VALID_SIG
numRounds dd ? ;should be 16
key32 dd MAX_KEY_BITS/32 dup (?)
sboxKeys dd MAX_KEY_BITS/64 dup (?)
subKeys dd TOTAL_SUBKEYS dup (?)
fullSbox dd 4*256 dup (?) ;S-box plus MDS
if KEY_MODE and KM_COMPILE
cSig1 dd ? ;show whether we're compiled
encryptFuncPtr dd ? ;ptr to encrypt function
decryptFuncPtr dd ? ;ptr to decrypt function
codeSize dd ? ;sizeof cipherProcCode
cSig2 dd ?
cipherProcCode dd (4600/4) dup (?) ;compiled code itself
endif
keyInstance ends
;
; To minimize code space, bias ebp to generate as many 8-bit offsets as possible
;
; BIAS_VAL is the offset of fullSbox within keyInstance minus 2*TOTAL_SUBKEYS
; bytes. biasEBP is the instruction text that adds the bias to ebp; ks is the
; matching operand text that subtracts it again, so ks.field addresses a field
; of the record ebp pointed at before biasEBP was executed.
BIAS_VAL = (fullSbox-(TOTAL_SUBKEYS*2))
biasEBP equ <add ebp,BIAS_VAL>
ks equ <[ebp-BIAS_VAL]>
;
;----------------------------------------------------------------------------------
; KeyMode-dependent definitions
;----------------------------------------------------------------------------------
; S32_n is the operand text for 32-bit S-box table n; SBS is the index scale
; that goes with it.
SBOX_SIZE equ 1024
if KEY_MODE and (KM_FULL or KM_COMPILE)
; Full and compiled keying: all four tables live in ks.fullSbox. Tables 0 and 1
; are interleaved dword by dword (bases 0 and 4, stride SBS = 8), and tables 2
; and 3 are interleaved the same way starting 2*SBOX_SIZE bytes in.
Sbump0 equ 0
Sbump1 equ 4
Sbump2 equ (2*SBOX_SIZE)
Sbump3 equ (2*SBOX_SIZE+4)
S32_0 equ <ks.fullSbox[Sbump0]>
S32_1 equ <ks.fullSbox[Sbump1]>
S32_2 equ <ks.fullSbox[Sbump2]>
S32_3 equ <ks.fullSbox[Sbump3]>
SBS = 8
EDX_ADJUST = 100h ;optimize for Pentium code size!
else
; Other modes: the 32-bit tables are four consecutive SBOX_SIZE-byte slices of
; the external MDStab (stride SBS = 4), and ks.fullSbox instead holds four
; 256-byte tables addressed through S8_0..S8_3.
S32_0 equ <MDStab>
S32_1 equ <MDStab[SBOX_SIZE]>
S32_2 equ <MDStab[SBOX_SIZE*2]>
S32_3 equ <MDStab[SBOX_SIZE*3]>
S8_0 equ <byte ptr ks.fullSbox>
S8_1 equ <byte ptr ks.fullSbox[100h]>
S8_2 equ <byte ptr ks.fullSbox[200h]>
S8_3 equ <byte ptr ks.fullSbox[300h]>
SBS = 4
EDX_ADJUST = 300h ;optimize for Pentium code size!
endif
; Lookups indexed by edx subtract EDX_ADJUST (scaled by SBS where applicable)
; from the address. The zero-valued eax/ebx/ecx equates let macros splice a
; register letter into a name of the form ExX_ADJUST without special-casing edx.
; NOTE(review): this implies edx is kept pre-biased by EDX_ADJUST by code
; outside this view -- confirm.
EAX_ADJUST = 0 ;these defns make handling EDX_ADJUST easier
EBX_ADJUST = 0
ECX_ADJUST = 0
;
;----------------------------------------------------------------------------------
; Macros for dealing with non-full keying S-boxes
;----------------------------------------------------------------------------------
;
; Everything from here to the matching endif exists only when KEY_MODE is
; neither KM_FULL nor KM_COMPILE.
if (KEY_MODE and (KM_FULL or KM_COMPILE)) eq 0
; lookupS8: replace a byte by its entry in 8-bit S-box table i (S8_0..S8_3).
;  cpuName - PentiumPro selects the movzx form; anything else the byte form
;  R       - blank: emit nothing. Otherwise, for the byte form, a register
;            letter (A/B/C/D) spliced into the xL and ExX names; for the
;            PentiumPro form, a complete register name used as both index
;            and destination
;  i       - S-box table number
; The byte form subtracts the ExX_ADJUST equate for that register from the address.
lookupS8 macro cpuName,R,i ;__TRANSPARENT__
ifnb <R>
ifdif <cpuName>,<PentiumPro>
mov R&L,S8_&i[E&R&X-E&R&X_ADJUST]
else
movzx R,S8_&i[R]
endif
endif
endm
; keyXor8: xor S-box key byte number I into a register.
;  cpuName - PentiumPro selects the 32-bit form; anything else the byte form
;  R       - blank: emit nothing. Otherwise a register letter (byte form,
;            spliced into xL) or a complete register name (PentiumPro form)
;  I       - byte index into ks.sboxKeys (byte form) or dword index into
;            sboxK8 (PentiumPro form)
; NOTE(review): in the visible source sboxK8 is defined only in the ZERO_KEY
; branch of the KEY_MODE selection, so the PentiumPro form appears to be
; usable only in that mode -- confirm.
keyXor8 macro cpuName,R,I ;__TRANSPARENT__
ifnb <R>
ifdif <cpuName>,<PentiumPro>
xor R&L,byte ptr ks.sboxKeys[I]
else
xor R,sboxK8[4*I] ;the byte is expanded to 32 bits here
endif
endif
endm
; lookupQ: replace a byte by its entry in the fixed permutation P_ij (which
; the P_ij equates map to q0 or q1).
;  cpuName - PentiumPro selects the movzx form; anything else the byte form
;  R       - blank: emit nothing. Otherwise a register letter (byte form,
;            spliced into xL and ExX) or a complete register name
;            (PentiumPro form)
;  i,j     - byte position and stage number, spliced onto "P_"
; The byte form subtracts the ExX_ADJUST equate for that register from the address.
lookupQ macro cpuName,R,i,j ;__TRANSPARENT__
ifnb <R>
ifdif <cpuName>,<PentiumPro>
mov R&L,P_&i&j[E&R&X-E&R&X_ADJUST]
else
movzx R,P_&i&j[R]
endif
endif
endm
; doS4: run up to four bytes through stage W: a fixed permutation lookup
; followed by an xor with an S-box key byte.
;  cpuName       - passed through to lookupQ and keyXor8
;  W             - stage number; selects permutation P_iW and key byte i+4*(W-1)
;  R1..R4, i1..i4 - register / byte-position pairs; blank registers are
;                  skipped by the callee macros
; All lookups are emitted before any of the key xors.
doS4 macro cpuName,W,R1,R2,i1,i2,R3,R4,i3,i4 ;__TRANSPARENT__
lookupQ cpuName,R1,i1,%(W)
lookupQ cpuName,R2,i2,%(W)
lookupQ cpuName,R3,i3,%(W)
lookupQ cpuName,R4,i4,%(W)
keyXor8 cpuName,R1,%(i1+4*W-4)
keyXor8 cpuName,R2,%(i2+4*W-4)
keyXor8 cpuName,R3,%(i3+4*W-4)
keyXor8 cpuName,R4,%(i4+4*W-4)
endm
; this macro is a nop for full keying
; (it is not even defined then: the enclosing conditional excludes KM_FULL and
; KM_COMPILE, and the visible call sites are guarded by the same test)
; doSbox: emit the key-dependent byte substitutions for up to four registers.
;  keySize        - key length in bits; only consulted in KM_ZERO mode
;  cpuName        - passed through to the lookup macros
;  R1..R4, i1..i4 - register / byte-position pairs (blank registers are skipped)
; KM_PART, KM_MIN: one lookupS8 per register.
; KM_ZERO: stage 4 when keySize exceeds 192, stage 3 when it exceeds 128,
;          then stage 2 unconditionally.
; KM_MIN, KM_ZERO: finally stage 1.
doSbox macro keySize,cpuName,R1,R2,i1,i2,R3,R4,i3,i4
if KEY_MODE and (KM_PART or KM_MIN) ;use 8-bit precomputed S-box
lookupS8 cpuName,R1,i1
lookupS8 cpuName,R2,i2
lookupS8 cpuName,R3,i3
lookupS8 cpuName,R4,i4
elseif KEY_MODE and KM_ZERO
if keySize gt 128
if keySize gt 192
doS4 cpuName,4,R1,R2,i1,i2,R3,R4,i3,i4
endif
doS4 cpuName,3,R1,R2,i1,i2,R3,R4,i3,i4
endif
doS4 cpuName,2,R1,R2,i1,i2,R3,R4,i3,i4
endif
if KEY_MODE and (KM_MIN or KM_ZERO) ;run bytes thru "next-to-last" 8-bit permutation, key xor
doS4 cpuName,1,R1,R2,i1,i2,R3,R4,i3,i4
endif
endm
endif ;!(KM_FULL or KM_COMPILE)
; GetSubkey: load one subkey dword from ks.subKeys into a register.
;   pfx    - accepted but not used by this macro
;   dest   - destination 32-bit register (doubles as the index register in
;            zero-key builds)
;   keyOfs - byte offset added to the subkey address
; Outside zero-key builds the address is fixed at assembly time. Zero-key
; builds first load roundNum and then index by 8*roundNum + 8*4 + keyOfs.
GetSubkey macro pfx,dest,keyOfs ;__TRANSPARENT__
if (KEY_MODE and KM_ZERO) eq 0
mov dest,ks.subKeys[keyOfs]
else
mov dest,roundNum
mov dest,ks.subKeys[8*dest+8*4+keyOfs]
endif
endm
;
;----------------------------------------------------------------------------------
; Round function macros
;----------------------------------------------------------------------------------
;
; Compiled mode code for Pentium Pro/II
; Be *very* careful about re-ordering opcodes! This ordering seems to perform best...
;
if KEY_MODE and KM_COMPILE
; do a load/xor from Sbox, put label for Sbox patching
; sboxOp: emit one S-box access and define a label directly after it.
;  lbl,cpuName - first and last pieces of the generated label name
;  R           - round number, spliced into the label name
;  opCode      - instruction to emit (the visible callers pass mov or xor)
;  dstReg      - destination register
;  idxReg      - register holding the byte index; scaled by SBS
;  sboxNum     - table number 0..3; selects the SbumpN offset
;  a_b         - extra tag spliced into the label name (callers pass a or b)
; The instruction addresses MDStab as a placeholder base. The label has the
; form lbl_SboxN_tag_RR_cpuName.
; NOTE(review): presumably key setup patches the address dword that ends at
; the label -- confirm against the reKey code.
sboxOp macro lbl,cpuName,R,opCode,dstReg,idxReg,sboxNum,a_b
opCode dstReg,MDStab[SBS*idxReg+Sbump&sboxNum]
concat lbl,_Sbox,%(sboxNum),<_>,a_b,<_R>,%(R),<_>,cpuName,< label dword>
endm
;
; RF_PPro: emit one Twofish round for the compiled-key Pentium Pro/II code.
;  lbl      - Enc selects the encryption body; any other value selects the
;             decryption body. Also the prefix of every generated label.
;  cpuName  - suffix of every generated label
;  r0..r3   - register letters; each is spliced into e?x, ?l and ?h names
;  R        - round number, tested against 0 and MAX_ROUNDS-1
;  S_INDEX  - byte offset of this round's subkey pair; used here only to
;             name the two _SK_ labels
; Scratch registers: esi, edi, ebp. edi is also live between rounds: a round
; leaves one Feistel xor pending in edi, to be applied at the top of the next
; expansion (the last round of each direction applies it itself).
; NOTE(review): this presumes the caller expands the rounds back to back with
; the register letters rotated each time -- confirm at the call sites.
; Both lea instructions use 12345 as a placeholder displacement and are
; followed by a label.
; NOTE(review): presumably key setup patches the dword ending at each label
; with the real subkey -- confirm against the reKey code.
;
RF_PPro macro lbl,cpuName,r0,r1,r2,r3,R,S_INDEX
ifdif <lbl>,<Enc>
;================ decryption code ========================
; if R eq MAX_ROUNDS-1
; rol e&r2&x,1 ;initial rotation (do it during whitening)
; endif
; esi collects tables 1,2,3,0 indexed by bytes 0..3 of r1 (tag b)
; edi collects tables 0,1,2,3 indexed by bytes 0..3 of r0 (tag a)
movzx esi,r1&l
movzx ebp,r1&h
if R ne MAX_ROUNDS-1
xor e&r0&x,edi
endif
movzx edi,r0&l
sboxOp lbl,cpuName,R,mov,esi,esi,1,b
sboxOp lbl,cpuName,R,xor,esi,ebp,2,b
ror e&r1&x,16
sboxOp lbl,cpuName,R,mov,edi,edi,0,a
movzx ebp,r0&h
sboxOp lbl,cpuName,R,xor,edi,ebp,1,a
movzx ebp,r1&l
ror e&r0&x,16
sboxOp lbl,cpuName,R,xor,esi,ebp,3,b
movzx ebp,r0&l
sboxOp lbl,cpuName,R,xor,edi,ebp,2,a
movzx ebp,r1&h
ror e&r1&x,16
sboxOp lbl,cpuName,R,xor,esi,ebp,0,b
movzx ebp,r0&h
if R eq 0
ror e&r0&x,16
else
ror e&r0&x,15 ;pre-rotate for next round
endif
sboxOp lbl,cpuName,R,xor,edi,ebp,3,a
; ebp = edi + 2*esi + placeholder, edi = edi + esi + placeholder
lea ebp,[edi+2*esi+12345]
concat lbl,_SK_,%((S_INDEX+4)/4),<_>,cpuName,< label dword>
lea edi,[edi+esi+12345]
concat lbl,_SK_,%((S_INDEX)/4),<_>,cpuName,< label dword>
xor e&r3&x,ebp
ror e&r3&x,1
if R eq 0
xor e&r2&x,edi
endif
else;================ encryption code ========================
if R eq 0
rol e&r3&x,1 ;do the initial rotate
endif
; esi collects tables 0,1,2,3 indexed by bytes 0..3 of r0 (tag a)
; edi collects tables 1,2,3,0 indexed by bytes 0..3 of r1 (tag b)
movzx esi,r0&l
movzx ebp,r0&h
if R ne 0
xor e&r1&x,edi ;complete the Feistel xor from last round
endif
movzx edi,r1&l
sboxOp lbl,cpuName,R,mov,esi,esi,0,a
sboxOp lbl,cpuName,R,xor,esi,ebp,1,a
ror e&r0&x,16
sboxOp lbl,cpuName,R,mov,edi,edi,1,b
movzx ebp,r1&h
sboxOp lbl,cpuName,R,xor,edi,ebp,2,b
movzx ebp,r0&l
ror e&r1&x,16
sboxOp lbl,cpuName,R,xor,esi,ebp,2,a
movzx ebp,r1&l
sboxOp lbl,cpuName,R,xor,edi,ebp,3,b
movzx ebp,r0&h
ror e&r0&x,16
sboxOp lbl,cpuName,R,xor,esi,ebp,3,a
movzx ebp,r1&h
if R eq MAX_ROUNDS-1
ror e&r1&x,16
else
ror e&r1&x,15 ;pre-rotate for next round
endif
sboxOp lbl,cpuName,R,xor,edi,ebp,0,b ;could do load, then xor...
; ebp = esi + edi + placeholder, edi = esi + 2*edi + placeholder
lea ebp,[esi+edi+12345]
concat lbl,_SK_,%((S_INDEX)/4),<_>,cpuName,< label dword>
lea edi,[esi+2*edi+12345]
concat lbl,_SK_,%((S_INDEX+4)/4),<_>,cpuName,< label dword>
xor e&r2&x,ebp
ror e&r2&x,1
if R eq MAX_ROUNDS-1
xor e&r3&x,edi
endif
endif ; lbl
endm ; RF_PPro
endif ; KEY_MODE and KM_COMPILE
RoundFunc macro keySize,lbl,cpuName,z0,z1,z2,z3,R,S_INDEX
concat lbl,keySize,Round_,%(R+1),<_>,cpuName,<:> ;keep the listing easy to follow
ifdif <cpuName>,<PentiumPro>
mov z0,eax ;save previous round Feistel results (not on first round)
mov z1,ebx
mov cl,ah ;set up to access Sbox
mov dl,bh
and eax,0FFH
and ebx,0FFH
if (KEY_MODE and (KM_FULL or KM_COMPILE)) eq 0
doSbox keySize,cpuName,C,D,1,2,A,B,0,1
endif
mov esi,S32_1[SBS*ecx] ;"interleave" even/odd Sboxes --> no cache bank problems
mov edi,S32_2[SBS*edx-SBS*EDX_ADJUST]
mov cl,byte ptr z0[2]
mov dl,byte ptr z1[2]
mov eax,S32_0[SBS*eax]
mov ebx,S32_1[SBS*ebx]
if (KEY_MODE and (KM_FULL or KM_COMPILE)) eq 0
doSbox keySize,cpuName,C,D,2,3
endif
xor esi,eax
xor edi,ebx
mov eax,S32_2[SBS*ecx]
mov ebx,S32_3[SBS*edx-SBS*EDX_ADJUST]
mov dl,byte ptr z0[3]
mov cl,byte ptr z1[3]
xor esi,eax
xor edi,ebx
if (KEY_MODE and (KM_FULL or KM_COMPILE)) eq 0
doSbox keySize,cpuName,C,D,0,3
endif
mov ebx,S32_0[SBS*ecx]
mov eax,S32_3[SBS*edx-SBS*EDX_ADJUST]
xor esi,eax
xor edi,ebx
if (KEY_MODE and KM_COMPILE) eq 0
add esi,edi ;first half of the PHT
GetSubkey lbl,eax,S_INDEX
; [Web code-viewer keyboard-shortcut help removed here; it was not part of 2fish_86.asm. The listing appears to be cut off at this point.]