/*____________________________________________________________________________
Copyright (C) 2002 PGP Corporation
All rights reserved.
$Id: Cast5.cpp,v 1.3 2002/08/06 20:10:19 dallen Exp $
____________________________________________________________________________*/
#include "pgpClassesConfig.h"
#include "Cast5.h"
#include "Cast5Box.h"
// Macros
// Rotate left; masking the right-shift count keeps the macro well defined when r == 0
#define ROL(x,r) ((x)<<(r) | (x)>>((32-(r)) & 31))
#define B0(x) ((x) >> 24 & 255)
#define B1(x) ((x) >> 16 & 255)
#define B2(x) ((x) >> 8 & 255)
#define B3(x) ((x) & 255)
#define F1(x,xkey,i) (ROL((xkey)[2*(i)] + (x), (xkey)[2*(i)+1]))
#define F2(x,xkey,i) (ROL((xkey)[2*(i)] ^ (x), (xkey)[2*(i)+1]))
#define F3(x,xkey,i) (ROL((xkey)[2*(i)] - (x), (xkey)[2*(i)+1]))
#define G1(x) (((S1[B0(x)] ^ S2[B1(x)]) - S3[B2(x)]) + S4[B3(x)])
#define G2(x) (((S1[B0(x)] - S2[B1(x)]) + S3[B2(x)]) ^ S4[B3(x)])
#define G3(x) (((S1[B0(x)] + S2[B1(x)]) ^ S3[B2(x)]) - S4[B3(x)])
#define x0 B0(x0123)
#define x1 B1(x0123)
#define x2 B2(x0123)
#define x3 B3(x0123)
#define x4 B0(x4567)
#define x5 B1(x4567)
#define x6 B2(x4567)
#define x7 B3(x4567)
#define x8 B0(x89AB)
#define x9 B1(x89AB)
#define xA B2(x89AB)
#define xB B3(x89AB)
#define xC B0(xCDEF)
#define xD B1(xCDEF)
#define xE B2(xCDEF)
#define xF B3(xCDEF)
#define z0 B0(z0123)
#define z1 B1(z0123)
#define z2 B2(z0123)
#define z3 B3(z0123)
#define z4 B0(z4567)
#define z5 B1(z4567)
#define z6 B2(z4567)
#define z7 B3(z4567)
#define z8 B0(z89AB)
#define z9 B1(z89AB)
#define zA B2(z89AB)
#define zB B3(z89AB)
#define zC B0(zCDEF)
#define zD B1(zCDEF)
#define zE B2(zCDEF)
#define zF B3(zCDEF)
// CAST5 key manipulation
// This expands a 128-bit key to a 32-word scheduled key, where each round
// uses two words: a 32-bit XOR mask and a 5-bit rotate amount. Shorter keys
// are simply padded with zeros (a padding sketch follows this function), and
// if they are 80 bits or less, the cipher is reduced to 12 rounds (not
// implemented here).
//
// The feed-forward used with x0123 through zCDEF prevents any weak keys,
// and the substitutions used to set up the xkey tables ensure that the
// subround keys are not easily derivable from each other, so linear
// cryptanalysis won't do very well against CAST.
void
__cdecl
CAST5Schedule(PGPUInt32 xkey[32], const PGPByte *k)
{
PGPUInt32 x0123, x4567, x89AB, xCDEF;
PGPUInt32 z0123, z4567, z89AB, zCDEF;
// Initialize x0123456789ABCDEF with input key
x0123 = (PGPUInt32) k[0]<<24 | (PGPUInt32) k[1]<<16 |
(PGPUInt32) k[2]<<8 | k[3];
x4567 = (PGPUInt32) k[4]<<24 | (PGPUInt32) k[5]<<16 |
(PGPUInt32) k[6]<<8 | k[7];
x89AB = (PGPUInt32) k[8]<<24 | (PGPUInt32) k[9]<<16 |
(PGPUInt32) k[10]<<8 | k[11];
xCDEF = (PGPUInt32) k[12]<<24 | (PGPUInt32) k[13]<<16 |
(PGPUInt32) k[14]<<8 | k[15];
// Now set up the key table
for (PGPUInt8 i = 0; i < 4; i++)
{
z0123 = x0123 ^ S5[xD] ^ S6[xF] ^ S7[xC] ^ S8[xE] ^ S7[x8];
z4567 = x89AB ^ S5[z0] ^ S6[z2] ^ S7[z1] ^ S8[z3] ^ S8[xA];
z89AB = xCDEF ^ S5[z7] ^ S6[z6] ^ S7[z5] ^ S8[z4] ^ S5[x9];
zCDEF = x4567 ^ S5[zA] ^ S6[z9] ^ S7[zB] ^ S8[z8] ^ S6[xB];
x0123 = z89AB ^ S5[z5] ^ S6[z7] ^ S7[z4] ^ S8[z6] ^ S7[z0];
x4567 = z0123 ^ S5[x0] ^ S6[x2] ^ S7[x1] ^ S8[x3] ^ S8[z2];
x89AB = z4567 ^ S5[x7] ^ S6[x6] ^ S7[x5] ^ S8[x4] ^ S5[z1];
xCDEF = zCDEF ^ S5[xA] ^ S6[x9] ^ S7[xB] ^ S8[x8] ^ S6[z3];
switch (i)
{
case 0: // Masking keys, rounds 0..7
xkey[0] = S5[z8] ^ S6[z9] ^ S7[z7] ^ S8[z6] ^ S5[z2];
xkey[2] = S5[zA] ^ S6[zB] ^ S7[z5] ^ S8[z4] ^ S6[z6];
xkey[4] = S5[zC] ^ S6[zD] ^ S7[z3] ^ S8[z2] ^ S7[z9];
xkey[6] = S5[zE] ^ S6[zF] ^ S7[z1] ^ S8[z0] ^ S8[zC];
xkey[8] = S5[x3] ^ S6[x2] ^ S7[xC] ^ S8[xD] ^ S5[x8];
xkey[10] = S5[x1] ^ S6[x0] ^ S7[xE] ^ S8[xF] ^ S6[xD];
xkey[12] = S5[x7] ^ S6[x6] ^ S7[x8] ^ S8[x9] ^ S7[x3];
xkey[14] = S5[x5] ^ S6[x4] ^ S7[xA] ^ S8[xB] ^ S8[x7];
break;
case 1: // Masking keys, rounds 8..15
xkey[16] = S5[z3] ^ S6[z2] ^ S7[zC] ^ S8[zD] ^ S5[z9];
xkey[18] = S5[z1] ^ S6[z0] ^ S7[zE] ^ S8[zF] ^ S6[zC];
xkey[20] = S5[z7] ^ S6[z6] ^ S7[z8] ^ S8[z9] ^ S7[z2];
xkey[22] = S5[z5] ^ S6[z4] ^ S7[zA] ^ S8[zB] ^ S8[z6];
xkey[24] = S5[x8] ^ S6[x9] ^ S7[x7] ^ S8[x6] ^ S5[x3];
xkey[26] = S5[xA] ^ S6[xB] ^ S7[x5] ^ S8[x4] ^ S6[x7];
xkey[28] = S5[xC] ^ S6[xD] ^ S7[x3] ^ S8[x2] ^ S7[x8];
xkey[30] = S5[xE] ^ S6[xF] ^ S7[x1] ^ S8[x0] ^ S8[xD];
break;
case 2: // Rotation keys, rounds 0..7
xkey[1] = (S5[z8]^S6[z9]^S7[z7]^S8[z6]^S5[z2]) & 31;
xkey[3] = (S5[zA]^S6[zB]^S7[z5]^S8[z4]^S6[z6]) & 31;
xkey[5] = (S5[zC]^S6[zD]^S7[z3]^S8[z2]^S7[z9]) & 31;
xkey[7] = (S5[zE]^S6[zF]^S7[z1]^S8[z0]^S8[zC]) & 31;
xkey[9] = (S5[x3]^S6[x2]^S7[xC]^S8[xD]^S5[x8]) & 31;
xkey[11] = (S5[x1]^S6[x0]^S7[xE]^S8[xF]^S6[xD]) & 31;
xkey[13] = (S5[x7]^S6[x6]^S7[x8]^S8[x9]^S7[x3]) & 31;
xkey[15] = (S5[x5]^S6[x4]^S7[xA]^S8[xB]^S8[x7]) & 31;
break;
case 3: // Rotation keys, rounds 8..15
xkey[17] = (S5[z3]^S6[z2]^S7[zC]^S8[zD]^S5[z9]) & 31;
xkey[19] = (S5[z1]^S6[z0]^S7[zE]^S8[zF]^S6[zC]) & 31;
xkey[21] = (S5[z7]^S6[z6]^S7[z8]^S8[z9]^S7[z2]) & 31;
xkey[23] = (S5[z5]^S6[z4]^S7[zA]^S8[zB]^S8[z6]) & 31;
xkey[25] = (S5[x8]^S6[x9]^S7[x7]^S8[x6]^S5[x3]) & 31;
xkey[27] = (S5[xA]^S6[xB]^S7[x5]^S8[x4]^S6[x7]) & 31;
xkey[29] = (S5[xC]^S6[xD]^S7[x3]^S8[x2]^S7[x8]) & 31;
xkey[31] = (S5[xE]^S6[xF]^S7[x1]^S8[x0]^S8[xD]) & 31;
break;
}
}
// Wipe the key-derived intermediate values from the locals
x0123 = x4567 = x89AB = xCDEF = 0;
z0123 = z4567 = z89AB = zCDEF = 0;
#if USE_CAST5_ASSEMBLY
// Pre-bias the rotate amounts for the assembly implementation (see the
// NOTE above the ROUND macros below)
for (PGPUInt8 i = 0; i < 32; i += 2)
{
xkey[i + 1] ^= 16;
}
#endif // USE_CAST5_ASSEMBLY
}
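// A minimal sketch of the zero-padding convention described above: callers
// with keys shorter than 128 bits pad with zeros before scheduling. The
// helper name and the use of <string.h> are illustrative assumptions, not
// part of the original API; keys of 80 bits or less would also need the
// 12-round variant, which this file does not implement.
#if 0 /* illustrative sketch, not part of the original build */
#include <string.h>
static void
ExampleScheduleShortKey(PGPUInt32 xkey[32], const PGPByte *key, size_t keyLen)
{
	PGPByte padded[16];
	memset(padded, 0, sizeof(padded));              // zero-pad up to 128 bits
	memcpy(padded, key, keyLen < 16 ? keyLen : 16);
	CAST5Schedule(xkey, padded);
	memset(padded, 0, sizeof(padded));              // wipe the key copy
}
#endif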
#if !USE_CAST5_ASSEMBLY
void
__cdecl
CAST5Encrypt(const PGPByte *in, PGPByte *out, const PGPUInt32 *xkey)
{
PGPUInt32 l, r, t;
l = (PGPUInt32) in[0]<<24 | (PGPUInt32) in[1]<<16 |
(PGPUInt32) in[2]<<8 | in[3];
r = (PGPUInt32) in[4]<<24 | (PGPUInt32) in[5]<<16 |
(PGPUInt32) in[6]<<8 | in[7];
t = F1(r, xkey, 0); l ^= G1(t);
t = F2(l, xkey, 1); r ^= G2(t);
t = F3(r, xkey, 2); l ^= G3(t);
t = F1(l, xkey, 3); r ^= G1(t);
t = F2(r, xkey, 4); l ^= G2(t);
t = F3(l, xkey, 5); r ^= G3(t);
t = F1(r, xkey, 6); l ^= G1(t);
t = F2(l, xkey, 7); r ^= G2(t);
t = F3(r, xkey, 8); l ^= G3(t);
t = F1(l, xkey, 9); r ^= G1(t);
t = F2(r, xkey, 10); l ^= G2(t);
t = F3(l, xkey, 11); r ^= G3(t);
// Stop here if only doing 12 rounds
t = F1(r, xkey, 12); l ^= G1(t);
t = F2(l, xkey, 13); r ^= G2(t);
t = F3(r, xkey, 14); l ^= G3(t);
t = F1(l, xkey, 15); r ^= G1(t);
out[0] = (PGPByte) B0(r);
out[1] = (PGPByte) B1(r);
out[2] = (PGPByte) B2(r);
out[3] = (PGPByte) B3(r);
out[4] = (PGPByte) B0(l);
out[5] = (PGPByte) B1(l);
out[6] = (PGPByte) B2(l);
out[7] = (PGPByte) B3(l);
}
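// A minimal known-answer sketch for the functions above, using the
// single-block test vector from RFC 2144 Appendix B (128-bit key, full
// 16 rounds). The function name and the memcmp usage are illustrative
// assumptions, not part of the original API.
#if 0 /* illustrative sketch, not part of the original build */
#include <string.h>
static bool
ExampleCast5KnownAnswerTest()
{
	static const PGPByte key[16] = {
		0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78,
		0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9A };
	static const PGPByte plain[8] = {
		0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF };
	static const PGPByte expected[8] = {
		0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 };
	PGPUInt32 xkey[32];
	PGPByte cipher[8];
	CAST5Schedule(xkey, key);
	CAST5Encrypt(plain, cipher, xkey);
	return memcmp(cipher, expected, 8) == 0;
}
#endif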
void
__cdecl
CAST5Decrypt(const PGPByte *in, PGPByte *out, const PGPUInt32 *xkey)
{
PGPUInt32 l, r, t;
r = (PGPUInt32) in[0]<<24 | (PGPUInt32) in[1]<<16 |
(PGPUInt32) in[2]<<8 | in[3];
l = (PGPUInt32) in[4]<<24 | (PGPUInt32) in[5]<<16 |
(PGPUInt32) in[6]<<8 | in[7];
t = F1(l, xkey, 15); r ^= G1(t);
t = F3(r, xkey, 14); l ^= G3(t);
t = F2(l, xkey, 13); r ^= G2(t);
t = F1(r, xkey, 12); l ^= G1(t);
// Start here if only doing 12 rounds
t = F3(l, xkey, 11); r ^= G3(t);
t = F2(r, xkey, 10); l ^= G2(t);
t = F1(l, xkey, 9); r ^= G1(t);
t = F3(r, xkey, 8); l ^= G3(t);
t = F2(l, xkey, 7); r ^= G2(t);
t = F1(r, xkey, 6); l ^= G1(t);
t = F3(l, xkey, 5); r ^= G3(t);
t = F2(r, xkey, 4); l ^= G2(t);
t = F1(l, xkey, 3); r ^= G1(t);
t = F3(r, xkey, 2); l ^= G3(t);
t = F2(l, xkey, 1); r ^= G2(t);
t = F1(r, xkey, 0); l ^= G1(t);
out[0] = (PGPByte) B0(l);
out[1] = (PGPByte) B1(l);
out[2] = (PGPByte) B2(l);
out[3] = (PGPByte) B3(l);
out[4] = (PGPByte) B0(r);
out[5] = (PGPByte) B1(r);
out[6] = (PGPByte) B2(r);
out[7] = (PGPByte) B3(r);
}
void
__cdecl
CAST5EncryptCFBdbl(
const PGPUInt32* xkey,
PGPUInt32 iv0,
PGPUInt32 iv1,
PGPUInt32 iv2,
PGPUInt32 iv3,
const PGPUInt32* src,
PGPUInt32* dest,
PGPUInt32 len)
{
PGPUInt32 iv[4] = {iv0, iv1, iv2, iv3};
while (len--)
{
CAST5Encrypt((const PGPByte*) iv, (PGPByte*) iv, xkey);
*dest++ = iv[0] ^= *src++;
*dest++ = iv[1] ^= *src++;
CAST5Encrypt((const PGPByte*) (iv+2), (PGPByte*) (iv+2), xkey);
*dest++ = iv[2] ^= *src++;
*dest++ = iv[3] ^= *src++;
}
}
void
__cdecl
CAST5DecryptCFBdbl(
const PGPUInt32* xkey,
PGPUInt32 iv0,
PGPUInt32 iv1,
PGPUInt32 iv2,
PGPUInt32 iv3,
const PGPUInt32* src,
PGPUInt32* dest,
PGPUInt32 len)
{
PGPUInt32 iv[4] = {iv0, iv1, iv2, iv3};
PGPUInt32 out[4];
while (len--)
{
CAST5Encrypt((const PGPByte*) iv, (PGPByte*) out, xkey);
*dest++ = out[0] ^ (iv[0] = *src++);
*dest++ = out[1] ^ (iv[1] = *src++);
CAST5Encrypt((const PGPByte*) (iv+2), (PGPByte*) (out+2), xkey);
*dest++ = out[2] ^ (iv[2] = *src++);
*dest++ = out[3] ^ (iv[3] = *src++);
}
}
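// A minimal round-trip sketch for the CFB pair above: encrypting and then
// decrypting with the same IV words must return the original data. Note
// that len counts four-word (16-byte) groups, not bytes. The buffer
// contents and the helper name are illustrative assumptions.
#if 0 /* illustrative sketch, not part of the original build */
#include <string.h>
static bool
ExampleCfbRoundTrip(const PGPUInt32 *xkey)
{
	PGPUInt32 plain[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };  // two 16-byte groups
	PGPUInt32 cipher[8], recovered[8];
	CAST5EncryptCFBdbl(xkey, 0, 0, 0, 0, plain, cipher, 2);
	CAST5DecryptCFBdbl(xkey, 0, 0, 0, 0, cipher, recovered, 2);
	return memcmp(recovered, plain, sizeof(plain)) == 0;
}
#endif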
#else // USE_CAST5_ASSEMBLY
/* NOTE: This expects the rotate amounts to be pre-incremented by 16
   (mod 32), which CAST5Schedule does with an XOR. */
/*
* high-speed x86 CAST implementation. Optimized for Intel P5 and P6.
* (Pentium, PPro, Pentium II).
*
* The pentium is a fairly straightforward dual-issue machine with a
* number of issue restrictions (but the only false hazard that causes
* a stall is WAW). The one significant pipeline hazard is that while
* loads are complete the cycle after they execute, they require that
* the address be available the cycle before they execute. One notable
* point is that the Pentium can execute two loads in parallel, as long
* as they hit in different banks of its 8-way-interleaved L1 cache.
*
* The PPro is an out-of-order superscalar system which can dispatch 3
* instructions per cycle (subject to significant limitations) and has
* two integer and one load/store unit. After decoding x86 instructions
* to a series of R-ops, it's a pretty straightforward out-of-order engine,
* executing instructions as their operands become available (register
* renaming ensures that false sharing doesn't block issue) except for
* one thing. The P6 is prone to an evil thing called a "partial register
* stall", when you write to a partial register and read a larger part.
* This overlap is resolved by retiring the partial write before the
* next instruction can be dispatched. This amounts to a *long* time.
*
* Unfortunately, precisely this operation is the best way to extract the
* bytes from the words in the CAST S-box lookup, so it's very important
* for an efficient CAST implementation.
* Fortunately, the processor has an exception built in. The
* instructions "xor reg,reg" and "sub reg,reg" set a magic "clear"
* flag on the register, following which the merge of the high zero
* bits causes no delay.
* The only problem is that a context switch or interrupt will save
* and restore the register, and a mov does *not* set the flag.
* Thus, you should explicitly re-clear the register occasionally,
* so that the stall won't happen.
*
* Each block encrypted seems like a good value for "occasionally".
*
* Also, loop labels should not be within 7 bytes of a 16-byte boundary.
*/
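/*
 * A minimal illustration of the re-clearing idiom described above (MSVC
 * inline assembly, illustrative only, not part of the original code): the
 * xor marks ebx as "clear", so a later partial write to bl followed by a
 * full-width read of ebx does not stall on the P6. A real function would
 * also preserve ebx, which is callee-saved.
 */
#if 0 /* illustrative sketch, not part of the original build */
static void
ExamplePartialRegisterClear()
{
	__asm
	{
		xor ebx, ebx            /* sets the "clear" flag; high 24 bits known zero */
		mov bl, al              /* partial write to the low byte */
		mov ecx, S1[ebx*4]      /* full-width read: no partial register stall */
	}
}
#endif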
/*
* Do round n from x into y, with op1/op2/op3
* This also loads in the next round's keys into
* eax and ecx.
*
* The minimum length dependency chain is 9 cycles for
* a round, but achieving that would require issuing
* three instructions per cycle - perhaps the PPro can
* manage that. Issuing two instructions per cycle,
* a round can be done in 10 cycles. Unfortunately,
* the processor does not have enough registers to use more
* parallelism than that.
*
* Dependency chain:
* op3 x,%eax/xor %edx,%edx
* rol %cl,%eax
* mov %ah,%bl/mov %al,%dl
* shr $16,%eax/XXX/AGI
* mov S1[%ebx],%ecx/mov %ah,%ebx \ Could be multi-issued
* mov S2[%edx],%edx/and $255,%eax / if we had the bandwidth
* op1 %edx,%ecx/mov S3[%ebx],%edx
* op2 %edx,%ecx/mov S4[%eax],%eax
* op3 %eax,%ecx/mov key,%eax
* xor %ecx,y/mov shift,%ecx
*
*/
/*
* This requires that %ebx be kept "clear" in the P6 sense.
*
* NOTE that the NOP *saves* a cycle on the Pentium! Without it,
* the following mov is paired with the shr, and suffers an AGI stall,
* which stalls the shr as well and thus loses two issue slots instead
* of one. We have a spare issue slot, but the shr is on the critical
* path and delaying it adds a cycle.
*/
#define ROUND(x,y,op1,op2,op3,n) \
__asm \
{ \
__asm op3 eax, x /* 1 U */ \
__asm xor edx, edx /* V */ \
__asm rol eax, cl /*2-5NP */ \
__asm mov bl, ah /* 6 U */ \
__asm mov dl, al /* V */ \
__asm shr eax, 16 /* 7 Ux */ \
__asm nop /* !!! */ /* AGI */ \
__asm mov ecx, S1[ebx*4] /* 8 U */ \
__asm mov bl, ah /* V */ \
__asm mov edx, S2[edx*4] /* 9 U */ \
__asm and eax, 255 /* V */ \
__asm op1 ecx, edx /*10 U */ \
__asm mov edx, S3[ebx*4] /* V */ \
__asm op2 ecx, edx /*11 U */ \
__asm mov edx, S4[eax*4] /* V */ \
__asm op3 ecx, edx /*12 U */ \
__asm mov eax, 8+[esi+8*n] /* V */ \
__asm xor y, ecx /*13 U */ \
__asm mov ecx, 12+[esi+8*n] /* V */ \
}
#define ROUND2(x,y,op1,op2,op3,n) \
__asm \
{ \
__asm op3 eax, x /* 1 U */ \
__asm xor edx, edx /* V */ \
__asm rol eax, cl /*2-5NP */ \
__asm mov bl, ah /* 6 U */ \
__asm mov dl, al /* V */ \
__asm shr eax, 16 /* 7 Ux */ \
__asm nop /* !!! */ /* AGI */ \
__asm mov ecx, S1[ebx*4] /* 8 U */ \
__asm mov bl, ah /* V */ \
__asm mov edx, S2[edx*4] /* 9 U */ \
__asm and eax, 255 /* V */ \
__asm op1 ecx, edx /*10 U */ \
__asm mov edx, S3[ebx*4] /* V */ \
__asm op2 ecx, edx /*11 U */ \
__asm mov edx, S4[eax*4] /* V */ \
__asm op3 ecx, edx /*12 U */ \
__asm mov eax, -8+[esi+8*n] /* V */ \
__asm xor y, ecx /*13 U */ \
__asm mov ecx, -4+[esi+8*n] /* V */ \
}
/* void
* CAST5encrypt(byte const *in, byte *out, word32 const *xkey)
* 4 8 12
*
* Register usage:
* eax is used as a temporary, and used for the primary round subkey
* ebx is all-zero in the high 24 bits, and is used for indexing
* ecx is used as a temporary, and for the rotate round subkey (PLUS 16)
* edx is all-zero in the high 24 bits, and is used for indexing
* esi points to the key schedule
* edi is the right half of the current block
* ebp is the left half of the current block
*/
#define left ebp
#define right edi
__declspec(naked)
void
__cdecl
CAST5Encrypt(const PGPByte *in, PGPByte *out, const PGPUInt32 *xkey)
{
__asm
{
ALIGN 16
push esi /* U */
push right /* V */