/*
 * cast5.cpp - high-speed x86 CAST implementation.  Optimized for the
 * Intel P5 and P6 (Pentium, PPro, Pentium II).
 *
 * The Pentium is a fairly straightforward dual-issue machine with a
 * number of issue restrictions (but the only false hazard that causes
 * a stall is WAW).  The one significant pipeline hazard is that while
 * loads complete the cycle after they execute, they require that
 * the address be available the cycle before they execute.  One notable
 * point is that the Pentium can execute two loads in parallel, as long
 * as they hit in different banks of its 8-way-interleaved L1 cache.
 *
 * The PPro is an out-of-order superscalar system which can dispatch 3
 * instructions per cycle (subject to significant limitations) and has
 * two integer and one load/store unit.  After decoding x86 instructions
 * to a series of R-ops, it's a pretty straightforward out-of-order engine,
 * executing instructions as their operands become available (register
 * renaming ensures that false dependencies don't block issue) except for
 * one thing.  The P6 is prone to an evil thing called a "partial register
 * stall", which occurs when you write to a partial register and then read
 * a larger register that overlaps it.  The overlap is resolved by retiring
 * the partial write before the next instruction can be dispatched, which
 * takes a *long* time.
 *
 * Unfortunately, precisely this operation is the best way to extract the
 * bytes from the words in the CAST S-box lookup, so it's very important
 * for an efficient CAST implementation.
 * Fortunately, the processor has an exception built in.  The
 * instructions "xor reg,reg" and "sub reg,reg" sets a magic "clear"
 * flag on the register, following which the merge of the high zero
 * bits causes no delay.
 * The only problem is that a context switch or interrupt will save
 * and restore the register, and a mov does *not* set the flag.
 * Thus, you should explicitly re-clear the register occasionally,
 * so that the stall won't happen.
 *
 * Once per block encrypted seems like a good value for "occasionally".
 *
 * Also, loop labels should not be within 7 bytes of a 16-byte boundary.
 */
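
/*
 * For reference, a rough C sketch of the three CAST-128 round-function
 * types (per RFC 2144) that the assembly below implements.  The names
 * f1/f2/f3 and ROTL32 are illustrative, not part of this module; S1..S4
 * are the cipher's 32-bit S-box tables, assumed visible here.  The byte
 * extraction Ia..Id (most significant byte first) is exactly the al/ah/shr
 * juggling the ROUND macro below performs, and the rotate counts appear to
 * be stored biased by 16 (see the "(PLUS 16)" note further down) so that
 * the interesting bytes land in ah/al after the rol.
 */
#if 0	/* reference sketch only */
#define ROTL32(x, n)	(((x) << ((n) & 31)) | ((x) >> ((32 - (n)) & 31)))

static PGPUInt32 f1(PGPUInt32 D, PGPUInt32 Km, PGPUInt32 Kr)	/* rounds 1,4,7,... */
{
	PGPUInt32 I = ROTL32(Km + D, Kr);
	return ((S1[I >> 24] ^ S2[(I >> 16) & 255]) - S3[(I >> 8) & 255]) + S4[I & 255];
}

static PGPUInt32 f2(PGPUInt32 D, PGPUInt32 Km, PGPUInt32 Kr)	/* rounds 2,5,8,... */
{
	PGPUInt32 I = ROTL32(Km ^ D, Kr);
	return ((S1[I >> 24] - S2[(I >> 16) & 255]) + S3[(I >> 8) & 255]) ^ S4[I & 255];
}

static PGPUInt32 f3(PGPUInt32 D, PGPUInt32 Km, PGPUInt32 Kr)	/* rounds 3,6,9,... */
{
	PGPUInt32 I = ROTL32(Km - D, Kr);
	return ((S1[I >> 24] + S2[(I >> 16) & 255]) ^ S3[(I >> 8) & 255]) - S4[I & 255];
}
#endif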

/*
 * Do round n from x into y, with op1/op2/op3
 * This also loads in the next round's keys into
 * eax and ecx.
 *
 * The minimum length dependency chain is 9 cycles for
 * a round, but achieving that would require issuing
 * three instructions per cycle - perhaps the PPro can
 * manage that.  Issuing two instructions per cycle,
 * a round can be done in 10 cycles.  Unfortunately,
 * the processor does not have enough registers to use more
 * parallelism than that.
 *
 * Dependency chain:
 * op3 x,%eax/xor %edx,%edx
 * rol %cl,%eax
 * mov %ah,%bl/mov %al,%dl
 * shr $16,%eax/XXX/AGI
 * mov S1[%ebx],%ecx/mov %ah,%ebx	\ Could be multi-issued
 * mov S2[%edx],%edx/and $255,%eax	/ if we had the bandwidth
 * op1 %edx,%ecx/mov S3[%ebx],%edx
 * op2 %edx,%ecx/mov S4[%eax],%eax
 * op3 %eax,%ecx/mov key,%eax
 * xor %ecx,y/mov shift,%ecx
 *
 */
/*
 * This requires that %ebx be kept "clear" in the P6 sense.
 *
 * NOTE that the NOP *saves* a cycle on the Pentium!  Without it,
 * the following mov is paired with the shr, and suffers an AGI stall,
 * which stalls the shr as well and thus loses two issue slots instead
 * of one.  We have a spare issue slot, but the shr is on the critical
 * path and delaying it adds a cycle.
 */

#define ROUND(x,y,op1,op2,op3,n)					\
	__asm											\
	{												\
		__asm op3 eax, x				/* 1 U  */	\
		__asm xor edx, edx				/*    V */	\
		__asm rol eax, cl				/*2-5NP */	\
		__asm mov bl, ah				/* 6 U  */	\
		__asm mov dl, al				/*    V */	\
		__asm shr eax, 16				/* 7 Ux */	\
		__asm nop /* !!! */				/* AGI  */	\
		__asm mov ecx, S1[ebx*4]		/* 8 U  */	\
		__asm mov bl, ah				/*    V */	\
		__asm mov edx, S2[edx*4]		/* 9 U  */	\
		__asm and eax, 255				/*    V */	\
		__asm op1 ecx, edx				/*10 U  */	\
		__asm mov edx, S3[ebx*4]		/*    V */	\
		__asm op2 ecx, edx				/*11 U  */	\
		__asm mov edx, S4[eax*4]		/*    V */	\
		__asm op3 ecx, edx				/*12 U  */	\
		__asm mov eax, 8+[esi+8*n]		/*    V */	\
		__asm xor y, ecx				/*13 U  */	\
		__asm mov ecx, 12+[esi+8*n]		/*    V */	\
	}
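
/*
 * In C terms (a sketch using the f1..f3 helpers above): the key schedule
 * interleaves the subkeys, with round n's masking key at xkey[2n] and its
 * rotate count (biased, per the "(PLUS 16)" note below) at xkey[2n+1], so
 * each ROUND preloads xkey[2n+2] and xkey[2n+3] into eax/ecx for the next
 * round.  The first invocation, ROUND(right, left, xor, sub, add, 0),
 * therefore amounts to something like
 *
 *	left ^= f1(right, Km0, Kr0);
 *
 * where Km0 = xkey[0] and Kr0 is what xkey[1] encodes.
 */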

/* void
 * CAST5encrypt(byte const *in, byte *out, PGPUInt32 const *xkey)
 *                          4         8                  12
 *
 * Register usage:
 * eax is used as a temporary, and used for the primary round subkey
 * ebx is all-zero in the high 24 bits, and is used for indexing
 * ecx is used as a temporary, and for the rotate round subkey (PLUS 16)
 * edx is all-zero in the high 24 bits, and is used for indexing
 * esi points to the key schedule
 * edi is the right half of the current block
 * ebp is the left half of the current block
 */

#define	left	ebp
#define	right	edi

NAKED 
void 
__cdecl 
CAST5encryptAsm(const PGPUInt8 *in, PGPUInt8 *out, const PGPUInt32 *xkey)
{
	__asm
	{
		ALIGN 16

		push esi				/*   U  */
		push right				/*    V */

		mov right, 8+[esp+4]	/*   U  - fetch in pointer */
		mov esi, 8+[esp+12]		/*    V - fetch key schedule pointer */
		push left				/*   U  */
		push ebx				/*    V */
		mov left, [right]		/*   U  - load left half of block */
		mov right, [right+4]	/*    V - load right half of block */

		bswap right				/*  NP  */
		bswap left				/*  NP  */

		mov eax, [esi]			/*   U  - preload round 1 masking key */
		xor ebx,ebx				/*    V - clear ebx (P6 "clear" state) */
		mov ecx, [esi+4]		/*   U  - preload round 1 rotate subkey */

		ROUND(right, left, xor, sub, add, 0)
		ROUND(left, right, sub, add, xor, 1)
		ROUND(right, left, add, xor, sub, 2)
		ROUND(left, right, xor, sub, add, 3)
		ROUND(right, left, sub, add, xor, 4)
		ROUND(left, right, add, xor, sub, 5)
		ROUND(right, left, xor, sub, add, 6)
		ROUND(left, right, sub, add, xor, 7)
		ROUND(right, left, add, xor, sub, 8)
		ROUND(left, right, xor, sub, add, 9)
		ROUND(right, left, sub, add, xor,10)
		ROUND(left, right, add, xor, sub,11)
		ROUND(right, left, xor, sub, add,12)
		ROUND(left, right, sub, add, xor,13)
		ROUND(right, left, add, xor, sub,14)
	/*	ROUND(left, right, xor, sub, add,15)
	 * Last round: omit loading of keys for next round
	 * Fetch out pointer and store data there instead
	 */
		add eax, left			/* 1 U  */
		xor edx, edx			/*    V */
		rol eax, cl				/*2-5NP */
		mov bl, ah				/* 6 U  */
		mov dl, al				/*    V */
		shr eax, 16				/* 7 Ux */
		mov esi, 16+[esp+8]		/*    V */
		mov ecx, S1[ebx*4]		/* 8 U  */
		mov bl, ah				/*    V */
		mov edx, S2[edx*4]		/* 9 U  */
		and eax, 255			/*    V */
		xor ecx, edx			/*10 U  */
		mov edx, S3[ebx*4]		/*    V */
		sub ecx, edx			/*11 U  */
		mov eax, S4[eax*4]		/*    V */
		add ecx, eax			/*12 U  */
		pop ebx					/*    V */
		bswap left				/*13 NP */
		xor right, ecx			/*14 U  */

		mov [esi+4], left		/*    V */
		bswap right				/*   NP */
		pop left				/*   U  */
		mov [esi], right		/*    V */
		pop right				/*   U  */
		pop esi					/*    V */
		ret						/*   NP */
	}
}
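
/*
 * Hypothetical usage sketch for the routine above: encrypt one 8-byte
 * block in place with an already-expanded key schedule.  `sched` is
 * assumed to hold the 32 interleaved subkey words for the 16 rounds; the
 * key-setup code that fills it is not on this page.  In-place use is fine
 * because the asm reads both input words before it fetches the out pointer.
 */
#if 0	/* illustration only */
static void
CAST5encryptBlockInPlace(PGPUInt8 block[8], const PGPUInt32 sched[32])
{
	CAST5encryptAsm(block, block, sched);
}
#endif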

/* 
 * asm void
 * CAST5encryptCFBdbl(
 * 	register PGPUInt32 const *xkey,	// esp+pushes+ 4
 * 	register PGPUInt32 in0,		// esp+pushes+ 8
 * 	register PGPUInt32 in1,		// esp+pushes+12
 * 	register PGPUInt32 in2,		// esp+pushes+16
 * 	register PGPUInt32 in3,		// esp+pushes+20
 * 	register PGPUInt32 const *src,	// esp+pushes+24
 * 	register PGPUInt32 *dest,		// esp+pushes+28
 * 	register PGPUInt32 len)		// esp+pushes+32
 *
 * Note that "len" is the number of 16-byte units to encrypt.
 * Since this function only encrypts one 8-byte block per trip
 * around the loop, the count has to be doubled.
 *
 * Doing the dbl part...
 * We use the argument slots on the stack as IV registers, but
 * actually only use one IV, in the in2/in3 slots.
 * Each iteration, we fetch from them the data to be encrypted
 * for the next iteration before storing the current
 * ciphertext for the iteration after the next, thus achieving the
 * necessary interleaving.
 */
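
/*
 * The interleaving is easier to see in plain C.  A rough reference sketch
 * (CAST5encryptCFBdblRef and fb[][] are names invented for illustration,
 * and copying the final feedback values back out for the caller is
 * ignored): two CFB streams alternate, each feeding its own previous
 * ciphertext block back in, so roughly C[i] = P[i] ^ E(C[i-2]) once past
 * the two IV blocks.
 */
#if 0	/* reference sketch only */
static void
CAST5encryptCFBdblRef(const PGPUInt32 *xkey,
	PGPUInt32 iv0, PGPUInt32 iv1, PGPUInt32 iv2, PGPUInt32 iv3,
	const PGPUInt32 *src, PGPUInt32 *dest, PGPUInt32 len)
{
	PGPUInt32 fb[2][2];			/* the two interleaved feedback blocks */
	unsigned which = 0;

	fb[0][0] = iv0;  fb[0][1] = iv1;	/* stream A IV (in0/in1) */
	fb[1][0] = iv2;  fb[1][1] = iv3;	/* stream B IV (in2/in3) */

	len *= 2;				/* two 8-byte blocks per 16-byte unit */
	while (len--) {
		/* Encrypt this stream's feedback block in place. */
		CAST5encryptAsm((const PGPUInt8 *)fb[which],
				(PGPUInt8 *)fb[which], xkey);
		/* XOR with plaintext; the ciphertext becomes this stream's
		 * feedback for its next turn, two blocks from now. */
		fb[which][0] ^= *src++;
		fb[which][1] ^= *src++;
		*dest++ = fb[which][0];
		*dest++ = fb[which][1];
		which ^= 1;			/* alternate between the two streams */
	}
}
#endif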

#define	left	ebp
#define	right	edi

NAKED 
void 
__cdecl 
CAST5encryptCFBdblAsm(
	const PGPUInt32	*xkey, 
	PGPUInt32		iv0, 
	PGPUInt32		iv1, 
	PGPUInt32		iv2, 
	PGPUInt32		iv3, 
	const PGPUInt32	*src, 
	PGPUInt32		*dest, 
	PGPUInt32		len)
{
	__asm
	{
		ALIGN 16

		push esi
		push right

		mov esi, 8+[esp+4]		/*   U  - load key schedule pointer */
		push left				/*    V */
		mov left, 12+[esp+8]	/*   U  - load in0 as left */
		mov right, 12+[esp+12]	/*    V - load in1 as right */

		push ebx				/*   U  */
		xor ebx,ebx				/*    V */

		shl dword ptr 16+[esp+32], 1	/*   NP - double loop counter */

	encryptloop:
		mov eax, [esi]			/*   U  - preload key material */
		mov ecx, [esi+4]		/*    V - preload key material */

		bswap right				/*   NP  */
		bswap left				/*   NP  */

		ROUND(right, left, xor, sub, add, 0)
		ROUND(left, right, sub, add, xor, 1)
		ROUND(right, left, add, xor, sub, 2)
		ROUND(left, right, xor, sub, add, 3)
		ROUND(right, left, sub, add, xor, 4)
		ROUND(left, right, add, xor, sub, 5)
		ROUND(right, left, xor, sub, add, 6)
		ROUND(left, right, sub, add, xor, 7)
		ROUND(right, left, add, xor, sub, 8)
		ROUND(left, right, xor, sub, add, 9)
		ROUND(right, left, sub, add, xor,10)
		ROUND(left, right, add, xor, sub,11)
		ROUND(right, left, xor, sub, add,12)
		ROUND(left, right, sub, add, xor,13)
		ROUND(right, left, add, xor, sub,14)
	/*	ROUND(left, right, xor, sub, add,15)
	 * Last round: omit loading of keys for next round
	 * Instead, start the CFB operations.  Including the
	 * swap of the halves, that ends up as:
	 *
	 * %eax = src[0] ^ bswap(right)
	 * %ecx = src[1] ^ bswap(left)
	 * src += 8bytes
	 * left = bswap(in2)
	 * right = bswap(in3)
	 * in2 = %eax
	 * dest[0] = %eax
	 * in3 = %ecx
 * dest[1] = %ecx
	 * dest += 8bytes
	 *
	 * For the '486, we can just use bswap.  For the '386, it's
	 * xchg ah,al
	 * rol $16,eax
	 * xchg ah,al
	 *
	 * Annoyingly, this really *sucks* on a PPro, due to fierce
	 * partial-register stalls.  So it pretty much has to go two ways.
	 * Options are:
	 * - Duplicate the entire encryption loop?
	 * - Do it in a pre-pass and a post-pass.  Makes life easy, but we
	 *   end up being memory bound.
 * - Something truly weird?
	 */
		add eax, left			/* 1 U  */
		xor edx, edx			/*    V */
		rol eax, cl				/*2-5NP */
		mov bl, ah				/* 6 U  */
		mov dl, al				/*    V */
		shr eax, 16				/* 7 Ux */
		nop						/*    V */
		mov ecx, S1[ebx*4]		/* 8 U  */
		mov bl, ah				/*    V */
		mov edx, S2[edx*4]		/* 9 U  */
		and eax, 255			/*    V */
		xor ecx, edx			/*10 U  */
		mov edx, S3[ebx*4]		/*    V */
		sub ecx, edx			/*11 U  */
		mov edx, S4[eax*4]		/*    V */
		add ecx, edx			/*12 U  */
		mov ebx, 16+[esp+24]	/*    V - fetch src pointer */
		xor right, ecx			/*13 U  */
		add ebx, 8				/*    V - increment src ptr */
		bswap left				/*   NP  */
		bswap right				/*   NP  */
		mov eax, [ebx-8]		/*   U  - get src word */
		mov ecx, [ebx-4]		/*    V - other src word */
		mov 16+[esp+24], ebx	/*   U  - store src pointer back */
		mov edx, 16+[esp+28]	/*    V - fetch dest pointer */
		xor eax, right			/*   U  */
		xor ecx, left			/*    V */
		mov left, 16+[esp+16]	/*   U  - fetch in2 for new left */
		mov right, 16+[esp+20]	/*    V - fetch in3 for new right */
		mov 16+[esp+16], eax	/*   U  - store ciphertext for next time */
		mov 16+[esp+20], ecx	/*    V - store ciphertext for next time */
		mov [edx+0], eax		/*   U  - store result */
		mov [edx+4], ecx		/*    V - store result */
		add edx, 8				/*   U  - increment dest ptr */
		xor ebx, ebx			/*    V - clear %ebx for next iteration */
		dec dword ptr 16+[esp+32]	/*   U  - decrement loop counter (set ZF) */
		mov 16+[esp+28], edx	/*    V - store dest pointer back */
		/* Pairing opportunity lost, sigh */
