2fish_86.asm

来自「一个towfish加密算法的源代码」· 汇编代码 · 共 1,769 行 · 第 1/4 页
ASM
1,769 行
  endm
endif
TwoFishDecrypt_&cpuName endp
endm ;cipherProc 
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
;
;-------------------------- Key schedule ----------------------- 
;
; Frame layout for the key-schedule code.  Every "alloc" below names one
; frame variable (alloc is a macro defined elsewhere in this file; presumably
; it assigns the current varOffs to the name and advances varOffs by the size
; argument -- TODO confirm against its definition).
varOffs = 0 
	alloc	kLen64,dword,4					;keyLen in bits/64
	alloc	reKeyJmpPtr,dword,4				;key-size dependent function ptr
	alloc	SboxKey,dword,MAX_KEY_BITS/16	;Sbox keys (RS code over k32o,k32o) 
	; tmp0..tmp3: scratch slots; skLdKey parks key bytes here for 192/256-bit keys
	alloc	tmp0,dword,4
	alloc	tmp1,dword,4
	alloc	tmp2,dword,4
	alloc	tmp3,dword,4
	; localPool: shared scratch area, addressed through the aliases defined below
	alloc	localPool,dword,4*SUBKEY_SIZE+(MAX_KEY_BITS/8)+4
	; localSize2 = size of the locals above; TwoFishReKey does "sub esp,localSize2"
	localSize2 = varOffs 
	alloc	_regs,dword,8*4					;pushad puts regs here 
	alloc	_retAddr,dword,4				;esp --> here on entry 
	alloc	keyPtr,dword,4					;pointer to key schedule to initialize 
;
; The two alias groups below overlay the same storage (both start at localPool).
;
; localPool usage during subkey generation
lKey32		equ		<localPool>				;local copy of the key bits
lSubkey		equ		<lKey32[MAX_KEY_BITS/8]>;local copy of subkey material
; localPool usage during S-box generation
pPtr		equ		<localPool>
kPtr		equ		<localPool+4>
tmpSbox		equ		<localPool+8>
tmpSbox8	equ		<byte ptr tmpSbox>
;
;----------------------------------------------------------------
; RSrem
; Input:	edx	=	dword to be shifted 4 times
; Output:	edx =	dword shifted
;			ecx,edi unmodified
;			eax,ebx,esi and the flags are clobbered; ebp is not touched
; Note:		Speed is not a very significant issue here. 
;			If it were, an 8x32 feedback table could be built.
;
;	g(x) = x^4 + A4 x^3 + 02 x^2 + A4 x + 1		;primitive polynomial = 14D
;
; Each of the four loop passes rotates the top byte b of edx into the low
; byte and xors in the feedback terms.  With
;	g2 = ((b << 1) and 0FFh) xor (4Dh  if bit 7 of b is set)
;	g3 = (b >> 1)            xor (0A6h if bit 0 of b is set) xor g2
; one pass computes
;	edx = rol(edx,8) xor (g3 << 24) xor (g2 << 16) xor (g3 << 8)
;
; This RSrem code runs equally well on a Pentium or Pentium Pro.  It is a little
; larger than Pentium-specific code (40 bytes), but not enough to worry about.
;
RSrem	proc
	push	ecx					;do not modify ecx here
	mov		ecx,4				;loop counter: one pass per byte
remLoop:
	rol		edx,8				;put new byte into place

	mov		eax,0FEH			;mask for bits 7..1 of the feedback byte b
	mov		esi,1				;get ready to mask from new byte
	mov		ebx,0FFH			;mask for the whole feedback byte

	and		eax,edx				;eax = b and 0FEh
	and		esi,edx				;esi=bit 0 of feedback byte
	and		ebx,edx				;ignore all but the important bits

	shl		eax,23				;eax = (b >> 1) << 24
	xor		esi,1				;invert bit 0 so the dec below gives the mask
	xor		ebx,80h				;with the sub below: sign-extend b into ebx

	xor		edx,eax				;fold (b >> 1) into byte 3
	dec		esi					;make esi = all zeroes or all ones
	sub		ebx,80h				;set upper 25 bits of ebx

	shr		eax,16				;eax = (b >> 1) << 8
	and		esi,0A600A600H		;mask the position
	add		ebx,ebx				;shift ebx into position

	xor		edx,eax				;fold (b >> 1) into byte 1
	xor		edx,esi				;fold the 0A6h terms (bytes 3 and 1)
	mov		esi,ebx				;keep a copy of the doubled byte

	shl		esi,24				;esi = ((b << 1) and 0FFh) << 24
	and		ebx,4D4D4D00H		;4Dh in bytes 3..1 when bit 7 of b was set

	xor		edx,ebx				;fold the 4Dh terms
	mov		ebx,esi				;ebx = doubled byte in byte 3

	shr		esi,8				;esi = doubled byte in byte 2
	xor		edx,ebx				;fold doubled byte into byte 3

	shr		ebx,16				;ebx = doubled byte in byte 1
	xor		edx,esi				;fold doubled byte into byte 2

	xor		edx,ebx				;fold doubled byte into byte 1

	dec		ecx
	jnz		remLoop				;repeat until all four bytes are done
	pop		ecx					;restore caller's ecx
	ret
RSrem	endp

; Constant tables that follow RSrem.
; mdsOffsTab is not referenced in the visible part of the file.
; NOTE(review): the values look like byte offsets into MDStab -- confirm at the use site.
mdsOffsTab	dd	0,4,2*1024,2*1024+4
			;	    LSB            MSB
; pTab: the P_xy symbols, four dwords per row; columns are byte positions
; 0..3 (LSB..MSB), rows are stages 1..4.
pTab		dd		P_01,P_11,P_21,P_31		;"last" stage (previous to MDSmat)
			dd		P_02,P_12,P_22,P_32
			dd		P_03,P_13,P_23,P_33	
			dd		P_04,P_14,P_24,P_34	
; pPtrTab: address of each 16-byte row of pTab
pPtrTab		dd		pTab,pTab+16,pTab+32,pTab+48

; x8Tab is only assembled when KEY_MODE selects KM_PART or KM_MIN
  if KEY_MODE AND (KM_PART or KM_MIN)
x8Tab		dd		0,100h,200h,300h
  endif
;
; reKeySharedSize = number of bytes from the start of RSrem up to this point
; (the RSrem code plus the tables above)
reKeySharedSize	=	$-RSrem

; ldCache addr,byteCnt,cpuName
; Touches the byteCnt bytes at addr with dummy loads so they are pulled into
; the cache.  Expands to nothing when cpuName is PentiumPro.
; For every 64-byte step NN it loads addr[NN] into eax and, when more than
; NN+32 bytes are requested, addr[NN+36] into ebx.
; Clobbers: eax, ebx.
; NOTE(review): the second load uses offset NN+36 although the guard tests
; NN+32, so it can read up to 7 bytes past byteCnt -- presumably harmless for
; the buffers it is used on; confirm.
ldCache	macro addr,byteCnt,cpuName
  ifdif <cpuName>,<PentiumPro>
NN=0
  rept (byteCnt+63)/64			;force cache line load (Pentium only)
    ; single-item irp: substitutes the current numeric value of NN for QQ
    irp QQ,<%(NN)>
    mov		eax,addr[QQ]
	endm
   if (NN+32) lt byteCnt
    irp QQ,<%(NN+36)>
      mov	ebx,addr[QQ]
	endm
   endif
NN=NN+64
  endm
  endif
endm

; pXor8 dst,src,reg,N
; Copies the two dwords at src+N and src+N+4 to dst+N and dst+N+4, xoring
; each with reg on the way.  Both loads are issued before either store.
; Clobbers: eax, ebx, flags.  reg must not be eax or ebx, since both are
; overwritten before reg is used.
pXor8	macro	dst,src,reg,N
	mov		eax,src+N			;first dword
	mov		ebx,src+N+4			;second dword
	xor		eax,reg
	xor		ebx,reg
	mov		dst+N,eax
	mov		dst+N+4,ebx
endm

; ld8 dstReg,src,cpuName
; Loads one byte from src into the register selected by the letter dstReg
; (the callers in this file pass a, b, c or d).
;   cpuName other than PentiumPro: "mov {dstReg}l,byte ptr src" -- writes only
;       the low byte, so bits 8..31 of e{dstReg}x keep their previous value.
;   cpuName = PentiumPro: "movzx e{dstReg}x,byte ptr src" -- zero-extends.
ld8		macro	dstReg,src,cpuName	;;__TRANSPARENT__
  ifdif <cpuName>,<PentiumPro>
	mov		dstReg&l,byte ptr src
  else
	movzx	e&dstReg&x,byte ptr src
  endif
endm

; skLdKey keySize,N,cpuName
; Loads the key bytes for byte position N from the local key copy lKey32 into
; the registers and stack slots that subKey8x32Proc reads.  Writing k[i] for
; "byte ptr lKey32[i]", the state after expansion is:
;   keySize 128:  ebp = k[N],    esi = k[N+8]
;   keySize 192:  tmp1 = k[N],   ebp = k[N+8],  tmp3 = esi = k[N+16]
;   otherwise:    tmp0 = k[N],   tmp1 = k[N+8],
;                 tmp2 = ebp = k[N+16],  tmp3 = esi = k[N+24]
; The path for cpuName other than PentiumPro loads through cl/dl and then
; copies the full ecx/edx, so it relies on bits 8..31 of ecx and edx already
; being zero; it overwrites cl and dl.  The PentiumPro path uses movzx
; straight into esi/ebp and leaves ecx/edx alone.
skLdKey	macro	keySize,N,cpuName	;;__TRANSPARENT__
  ifdif <cpuName>,<PentiumPro>
	mov		cl,byte ptr lKey32[N]
	mov		dl,byte ptr lKey32[N+8]
   if keySize eq 128
	mov		ebp,ecx
	mov		esi,edx					;load esi/ebp with key bytes
   elseif keySize eq 192
	mov		tmp1,ecx				;save these
	mov		cl,byte ptr lKey32[N+16]
	mov		ebp,edx
	mov		tmp3,ecx
	mov		esi,ecx
   else ;keySize eq 256
	mov		tmp0,ecx				;save copy on stack
	mov		cl,byte ptr lKey32[N+16]
	mov		tmp1,edx
	mov		dl,byte ptr lKey32[N+24]
	mov		tmp2,ecx
	mov		ebp,ecx
	mov		tmp3,edx
	mov		esi,edx
   endif
  else
   if keySize eq 128
	movzx	ebp,byte ptr lKey32[N]	;load esi/ebp with key bytes
	movzx	esi,byte ptr lKey32[N+8]
   elseif keySize eq 192
    movzx	esi,byte ptr lKey32[N]
	movzx	ebp,byte ptr lKey32[N+8]
	mov		tmp1,esi				;esi is only a carrier here; reloaded below
    movzx	esi,byte ptr lKey32[N+16]
	mov		tmp3,esi
   else	;keySize eq 256
	movzx	ebp,byte ptr lKey32[N]	;load esi/ebp with key bytes
	movzx	esi,byte ptr lKey32[N+8]
	mov		tmp0,ebp
	mov		tmp1,esi
	movzx	ebp,byte ptr lKey32[N+16]
	movzx	esi,byte ptr lKey32[N+24]
	mov		tmp2,ebp
	mov		tmp3,esi
   endif
  endif
endm
;
;
; Build switches, one per CPU variant.  When nonzero, the subkey routines for
; that CPU are emitted as callable code instead of being expanded inline
; (tested as "if DO_CALL_&cpuName" in subKey8x32Proc, skEvenOdd and reKeyProc).
DO_CALL_Pentium			=	0		;inline code, or use calls?
DO_CALL_PentiumPro		=	0

; For every CPU built in call mode, declare one public ASM_ALIGN_ name for
; each XX in 0..3.
; NOTE(review): presumably this produces names like ASM_ALIGN_0_Pentium, while
; the labels defined in reKeyProc also carry the key size
; (ASM_ALIGN_&kSize&_0_&cpuName) -- looks inconsistent; confirm before
; enabling either switch.
  irp cpuName,<Pentium,PentiumPro>
    if DO_CALL_&cpuName
	  irp XX,<0,1,2,3>
	    concat	<public ASM_ALIGN_>,%(XX),<_>,cpuName
	  endm
	endif
  endm

; jmpRet cpuName
; Timing/debug aid: emits a jump straight to reKeyDone_&cpuName.  In the
; visible code every use is commented out (see the ";	jmpRet" lines with
; clock counts in reKeyProc).
jmpRet	macro	cpuName
	 jmp reKeyDone_&cpuName			;comment this line to do full testing
endm	
;
; precompute first step in 32-bit chunks??  Nope -- only saves ~ 40 clocks
;
; subKey8x32Proc keySize,N,cpuName,s0,s1,s2,s3
;
; Emits the loop skProc_&keySize&_&N&_&cpuName.  Each pass takes the four
; byte indices edi, edi+2, edi+4 and edi+6, runs them through the 8x8 tables
; s0, s1 (and s2, s3 when supplied), xoring a key byte after each table,
; then stores MDStab[4*result+N*1024] at lSubkey[4*edi+N*SUBKEY_SIZE+{0,8,16,24}].
; edi is then decremented by 8 and the loop repeats until the subtraction
; borrows.
;
; Params:	keySize, N, cpuName	select the label name and the MDStab column
;			s0,s1				8x8 tables, always used
;			s2,s3				optional extra tables (tested with ifnb)
; Input:	edi = i
;			esi = byte to xor with s0[] output
;			ebp = byte to xor with s1[] output
;			tmp1 (and tmp0 when keySize > 192) = bytes to xor with the
;				s2[] (and s3[]) output
;			tmp3 (and tmp2 when keySize > 192) = values reloaded into
;				esi (and ebp) for the next pass
;			When ld8 expands to a byte move (cpuName other than
;			PentiumPro), bits 8..31 of eax, ebx and edx must be zero on
;			entry; ecx is cleared here.
; Output:	edi = value left by the final "sub edi,8"; eax = 0
; Clobbers:	eax, ebx, ecx, edx, flags (esi/ebp are reloaded from tmp3/tmp2
;			when s2 is supplied)
;
; In call mode (DO_CALL_&cpuName nonzero) the code sits one return address
; deeper on the stack, hence the +4*DO_CALL_&cpuName and LSK_OFFS
; adjustments, and the routine is padded with zero bytes after the ret.
; Fix: that padding expression used the label skProc_&N&_&cpuName, which is
; never defined; it now uses the label this routine actually defines.
;
subKey8x32Proc	macro	keySize,N,cpuName,s0,s1,s2,s3
  if DO_CALL_&cpuName
	align	4
LSK_OFFS	=	4
  else
LSK_OFFS	=	0
  endif
skProc_&keySize&_&N&_&cpuName:
	xor		ecx,ecx					;keep Pentium pairing properly aligned
	ld8		<a>,s0[edi],cpuName		;run through first 8x8 permutation

	xor		eax,esi					;xor with first key byte
	ld8		<b>,s0[edi+2],cpuName

	xor		ebx,esi
	ld8		<c>,s0[edi+4],cpuName

	xor		ecx,esi
	ld8		<d>,s0[edi+6],cpuName

	xor		edx,esi
	ld8		<a>,s1[eax],cpuName		;run through second 8x8

	xor		eax,ebp					;xor with second key byte
	ld8		<b>,s1[ebx],cpuName

	xor		ebx,ebp
	ld8		<c>,s1[ecx],cpuName

	xor		ecx,ebp
	ld8		<d>,s1[edx],cpuName
 ifnb <s2>
	xor		edx,ebp
	ld8		<a>,s2[eax],cpuName		;run through 3rd 8x8

	mov		esi,tmp1+4*DO_CALL_&cpuName
  if keySize gt 192
	mov		ebp,tmp0+4*DO_CALL_&cpuName
  endif
	xor		eax,esi					;xor with 3rd key byte
	ld8		<b>,s2[ebx],cpuName

	xor		ebx,esi
	ld8		<c>,s2[ecx],cpuName

	xor		ecx,esi
	ld8		<d>,s2[edx],cpuName

	xor		edx,esi
   ifnb <s3>
	ld8		<a>,s3[eax],cpuName		;run through 4th 8x8

	xor		eax,ebp					;xor with 4th key byte
	ld8		<b>,s3[ebx],cpuName

	xor		ebx,ebp
	ld8		<c>,s3[ecx],cpuName

	xor		ecx,ebp
	ld8		<d>,s3[edx],cpuName

	xor		edx,ebp	
   endif
	mov		eax,MDStab[4*eax+N*1024];do final 8x8 and MDS multiply lookup

	mov		esi,tmp3+4*DO_CALL_&cpuName	;reload old key bytes
   if keySize gt 192
	mov		ebp,tmp2+4*DO_CALL_&cpuName	;(for next time)
   endif
  else
	xor		edx,ebp	
	mov		eax,MDStab[4*eax+N*1024];do final 8x8 and MDS multiply lookup
  endif

concat <    mov     lSubkey[4*edi+N*SUBKEY_SIZE][>,%LSK_OFFS,<],eax>
	mov		eax,MDStab[4*ebx+N*1024]

concat <    mov     lSubkey[4*edi+N*SUBKEY_SIZE+8][>,%LSK_OFFS,<],eax>
	mov		eax,MDStab[4*ecx+N*1024]

concat <    mov     lSubkey[4*edi+N*SUBKEY_SIZE+16][>,%LSK_OFFS,<],eax>
	mov		eax,MDStab[4*edx+N*1024]

concat <    mov     lSubkey[4*edi+N*SUBKEY_SIZE+24][>,%LSK_OFFS,<],eax>
	xor		eax,eax					;clear bits 8..31 for the next byte load

	sub		edi,8					;step down to the next group of indices
	jae		skProc_&keySize&_&N&_&cpuName	;loop until the subtraction borrows

  if DO_CALL_&cpuName
	ret	0
	;pad with zero bytes up to the next multiple of 16 from the routine start
	db		((1 + (15-(($-skProc_&keySize&_&N&_&cpuName) AND 15))) AND 15) dup (0)
  endif
endm

; skProc keySize,N,cpuName
; Expands subKey8x32Proc for byte position N with the tables that belong to
; the given key size.  The tables are passed highest stage first:
;   128:  P_{N}2, P_{N}1
;   192:  P_{N}3, P_{N}2, P_{N}1
;   256:  P_{N}4, P_{N}3, P_{N}2, P_{N}1
; Any other keySize expands to the undefined statement Invalid_KeySize_Error,
; which stops the assembly with an error.
skProc	macro	keySize,N,cpuName		;;__TRANSPARENT__
  if keySize eq 128
	subKey8x32Proc	keySize,N,cpuName,P_&N&2,P_&N&1
  elseif keySize eq 192
	subKey8x32Proc	keySize,N,cpuName,P_&N&3,P_&N&2,P_&N&1
  elseif keySize eq 256
	subKey8x32Proc	keySize,N,cpuName,P_&N&4,P_&N&3,P_&N&2,P_&N&1
  else
	Invalid_KeySize_Error;;
  endif
endm

; skEvenOdd keySize,N,cpuName
; Runs the skProc loop for byte position N twice: first with the key bytes
; selected by skLdKey ...,N (the "even" pass), then with those selected by
; skLdKey ...,N+4 (the "odd" pass, started at an edi one higher).
;
; Input:	N = 0: nothing (edi is loaded here)
;			N > 0: edi as left by the preceding skEvenOdd expansion
; Output:	edi as left by the odd pass
;
; Inline mode (DO_CALL_&cpuName = 0): skProc is expanded once.  When its loop
; falls through, bit 0 of edi tells which pass just ended; after the even
; pass the code sets up the odd pass and jumps back into the same loop.
; Call mode: the same skProc_ label is called twice.
skEvenOdd	macro	keySize,N,cpuName
  if N eq 0
	mov		edi,TOTAL_SUBKEYS-8		;do the even ones first (backwards)
  else
	add		edi,TOTAL_SUBKEYS-1		;rebase edi from the previous odd pass
  endif
	skLdKey	keySize,N,cpuName
  if DO_CALL_&cpuName
	call	skProc_&keySize&_&N&_&cpuName
	add		edi,TOTAL_SUBKEYS+1		;go back for the odd ones
	skLdKey	keySize,N+4,cpuName
	call	skProc_&keySize&_&N&_&cpuName
  else
	skProc	keySize,N,cpuName		;instantiate the code inline
	test	edi,1					;and loop twice for even/odd subkeys
	jnz		short sk&keySize&_&N&_done_&cpuName
	  add	edi,TOTAL_SUBKEYS+1		;go back for the odd ones
	  skLdKey	keySize,N+4,cpuName
	  jmp	skProc_&keySize&_&N&_&cpuName
	  align	4
sk&keySize&_&N&_done_&cpuName:
  endif
endm

;
; Byte-sized operand 12 bytes into tmpSbox (tmpSbox8 is "byte ptr tmpSbox").
_S_	equ		<tmpSbox8[12]>			;use this to reference tmpSbox
;
; int reKey(keyInstance *keyPtr);	// build the key schedule
;
reKeyProc	macro	cpuName
reKey_Start_&cpuName:
  if DO_CALL_&cpuName
	db		1 dup (0)				;align
	irp		kSize,<256,192,128>
ASM_ALIGN_&kSize&_0_&cpuName:	skProc %kSize,0,cpuName
ASM_ALIGN_&kSize&_1_&cpuName:	skProc %kSize,1,cpuName
ASM_ALIGN_&kSize&_2_&cpuName:	skProc %kSize,2,cpuName
ASM_ALIGN_&kSize&_3_&cpuName:	skProc %kSize,3,cpuName
	endm
bigKeyCode_&cpuName = (ASM_ALIGN_128_0_&cpuName-ASM_ALIGN_256_0_&cpuName)
   endif
;
TwoFishReKey_&cpuName proc 
	pushad 
	mov		ebp,keyPtr-localSize2
	sub		esp,localSize2 

	xor		edi,edi	
	biasEBP

	mov		ecx,ks.keyLen			;should be 128, 192, or 256
	shr		ecx,6					;divide by 64
	ldCache	[esp],localSize2,cpuName
	ldCache	[ks.key32],%(fullSbox-key32),cpuName
;	jmpRet  cpuName    ;;  ??? to here (MMX: 160)(Pro:  160) ["Nothing"]

	;copy over key material, do the Reed-Solomon thing
	mov		kLen64,ecx				;store key length
rsLoop_&cpuName:
	mov		edx,ks.key32[8*edi+4]
	mov		lKey32[8*edi+4],edx		;build a local copy of key bytes
	call	RSrem					;process the first four bytes
	mov		eax,ks.key32[8*edi]		;get next two key material dwords
	mov		lKey32[8*edi],eax		;(local copy)
	inc		edi						;bump ptr
	xor		edx,eax
	call	RSrem					;now process the final four
	mov		SboxKey[4*ecx-4],edx	;store result in reverse order
	mov		ks.sboxKeys[4*ecx-4],edx
 if KEY_MODE and KM_ZERO	
  ifidn <cpuName>,<PentiumPro>
	lea		esi,[4*ecx-4]
	movzx	eax,dl					;expand Pentium Pro keys: dword --> four bytes
	movzx	ebx,dh
	mov		sboxK8[4*esi],eax
	shr		edx,16
	mov		sboxK8[4*esi+4],ebx
	movzx	eax,dh
	mov		sboxK8[4*esi+12],eax
	movzx	edx,dl
	mov		sboxK8[4*esi+8],edx
  endif
 endif
	dec		ecx
	jg		rsLoop_&cpuName

	mov		esi,skJmpTab_&cpuName[4*edi-4]
	; compute all the subkey s-box results
	xor		edi,edi					;edi = i in subKey loop
	xor		eax,eax
	xor		ebx,ebx
	xor		ecx,ecx
	xor		edx,edx
;	jmpRet  cpuName    ;;  487 to here (MMX: 451)(Pro:  315) ["RSrem"]
	jmp		esi

  irp kBits,<256,192,128>
concat <sk>,%(kBits),<_>,cpuName:
	skEvenOdd	%(kBits),0,cpuName
	skEvenOdd	%(kBits),1,cpuName
	skEvenOdd	%(kBits),2,cpuName
	skEvenOdd	%(kBits),3,cpuName
   if (kBits ne 128)
    jmp		sk8Done_&cpuName		;put 128-bit key last to fall through!
   endif
  endm
  ife DO_CALL_&cpuName				;show how big the 192/256 bit code is
bigKeyCode_&cpuName = (sk128_&cpuName-sk256_&cpuName)
  endif

sk8Done_&cpuName:
;	jmpRet  cpuName    ;; 1850 to here (MMX:1100)(Pro:  959) ["4skEvenOdd"]
	add		edi,TOTAL_SUBKEYS-1+6	;build in reverse order
	mov		ebp,keyPtr				;put final stuff here
	biasEBP

	; now run subkey s-box bytes through MDS and rotate/PHT/combine
subkeyLp_&cpuName:
	mov		eax,lSubkey[4*edi]
	mov		ebx,lSubkey[4*edi+4]

	mov		ecx,lSubkey[4*edi+SUBKEY_SIZE]
	mov		edx,lSubkey[4*edi+SUBKEY_SIZE+4]

	xor		eax,ecx
2fish_86.asm - 源码说明

本页面展示了「一个towfish加密算法的源代码」中的 2fish_86.asm 源码文件，采用汇编编程语言编写，共 1,769 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与towfish相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?