/*____________________________________________________________________________
Copyright (C) 2002 PGP Corporation
All rights reserved.
$Id: Cast5.cpp,v 1.3 2002/08/06 20:10:19 dallen Exp $
____________________________________________________________________________*/
#include "pgpClassesConfig.h"
#include "Cast5.h"
#include "Cast5Box.h"
// Macros
// Rotate left; masking the right-shift count keeps the macro well defined when r == 0
#define ROL(x,r) ((x)<<(r) | (x)>>((32-(r)) & 31))
#define B0(x) ((x) >> 24 & 255)
#define B1(x) ((x) >> 16 & 255)
#define B2(x) ((x) >> 8 & 255)
#define B3(x) ((x) & 255)
#define F1(x,xkey,i) (ROL((xkey)[2*(i)] + (x), (xkey)[2*(i)+1]))
#define F2(x,xkey,i) (ROL((xkey)[2*(i)] ^ (x), (xkey)[2*(i)+1]))
#define F3(x,xkey,i) (ROL((xkey)[2*(i)] - (x), (xkey)[2*(i)+1]))
#define G1(x) (((S1[B0(x)] ^ S2[B1(x)]) - S3[B2(x)]) + S4[B3(x)])
#define G2(x) (((S1[B0(x)] - S2[B1(x)]) + S3[B2(x)]) ^ S4[B3(x)])
#define G3(x) (((S1[B0(x)] + S2[B1(x)]) ^ S3[B2(x)]) - S4[B3(x)])
#define x0 B0(x0123)
#define x1 B1(x0123)
#define x2 B2(x0123)
#define x3 B3(x0123)
#define x4 B0(x4567)
#define x5 B1(x4567)
#define x6 B2(x4567)
#define x7 B3(x4567)
#define x8 B0(x89AB)
#define x9 B1(x89AB)
#define xA B2(x89AB)
#define xB B3(x89AB)
#define xC B0(xCDEF)
#define xD B1(xCDEF)
#define xE B2(xCDEF)
#define xF B3(xCDEF)
#define z0 B0(z0123)
#define z1 B1(z0123)
#define z2 B2(z0123)
#define z3 B3(z0123)
#define z4 B0(z4567)
#define z5 B1(z4567)
#define z6 B2(z4567)
#define z7 B3(z4567)
#define z8 B0(z89AB)
#define z9 B1(z89AB)
#define zA B2(z89AB)
#define zB B3(z89AB)
#define zC B0(zCDEF)
#define zD B1(zCDEF)
#define zE B2(zCDEF)
#define zF B3(zCDEF)
// CAST5 key manipulation
// This expands a 128-bit key to a 32-word scheduled key, where each round
// uses two words: a 32-bit XOR mask and a 5-bit rotate amount. Shorter keys
// are simply padded with zeros (a padding sketch follows this function), and
// if they are 80 bits or less, the cipher is reduced to 12 rounds (not
// implemented here).
//
// The feed-forward used with x0123 through zCDEF prevents any weak keys,
// and the substitutions used to set up the xkey tables ensure that the
// subround keys are not easily derivable from each other, so linear
// cryptanalysis won't do very well against CAST.
void
__cdecl
CAST5Schedule(PGPUInt32 xkey[32], const PGPByte *k)
{
PGPUInt32 x0123, x4567, x89AB, xCDEF;
PGPUInt32 z0123, z4567, z89AB, zCDEF;
// Initialize x0123456789ABCDEF with input key
x0123 = (PGPUInt32) k[0]<<24 | (PGPUInt32) k[1]<<16 |
(PGPUInt32) k[2]<<8 | k[3];
x4567 = (PGPUInt32) k[4]<<24 | (PGPUInt32) k[5]<<16 |
(PGPUInt32) k[6]<<8 | k[7];
x89AB = (PGPUInt32) k[8]<<24 | (PGPUInt32) k[9]<<16 |
(PGPUInt32) k[10]<<8 | k[11];
xCDEF = (PGPUInt32) k[12]<<24 | (PGPUInt32) k[13]<<16 |
(PGPUInt32) k[14]<<8 | k[15];
// Now set up the key table
for (PGPUInt8 i = 0; i < 4; i++)
{
z0123 = x0123 ^ S5[xD] ^ S6[xF] ^ S7[xC] ^ S8[xE] ^ S7[x8];
z4567 = x89AB ^ S5[z0] ^ S6[z2] ^ S7[z1] ^ S8[z3] ^ S8[xA];
z89AB = xCDEF ^ S5[z7] ^ S6[z6] ^ S7[z5] ^ S8[z4] ^ S5[x9];
zCDEF = x4567 ^ S5[zA] ^ S6[z9] ^ S7[zB] ^ S8[z8] ^ S6[xB];
x0123 = z89AB ^ S5[z5] ^ S6[z7] ^ S7[z4] ^ S8[z6] ^ S7[z0];
x4567 = z0123 ^ S5[x0] ^ S6[x2] ^ S7[x1] ^ S8[x3] ^ S8[z2];
x89AB = z4567 ^ S5[x7] ^ S6[x6] ^ S7[x5] ^ S8[x4] ^ S5[z1];
xCDEF = zCDEF ^ S5[xA] ^ S6[x9] ^ S7[xB] ^ S8[x8] ^ S6[z3];
switch (i)
{
case 0: // Masking keys, rounds 0..7
xkey[0] = S5[z8] ^ S6[z9] ^ S7[z7] ^ S8[z6] ^ S5[z2];
xkey[2] = S5[zA] ^ S6[zB] ^ S7[z5] ^ S8[z4] ^ S6[z6];
xkey[4] = S5[zC] ^ S6[zD] ^ S7[z3] ^ S8[z2] ^ S7[z9];
xkey[6] = S5[zE] ^ S6[zF] ^ S7[z1] ^ S8[z0] ^ S8[zC];
xkey[8] = S5[x3] ^ S6[x2] ^ S7[xC] ^ S8[xD] ^ S5[x8];
xkey[10] = S5[x1] ^ S6[x0] ^ S7[xE] ^ S8[xF] ^ S6[xD];
xkey[12] = S5[x7] ^ S6[x6] ^ S7[x8] ^ S8[x9] ^ S7[x3];
xkey[14] = S5[x5] ^ S6[x4] ^ S7[xA] ^ S8[xB] ^ S8[x7];
break;
case 1: // Masking keys, rounds 8..15
xkey[16] = S5[z3] ^ S6[z2] ^ S7[zC] ^ S8[zD] ^ S5[z9];
xkey[18] = S5[z1] ^ S6[z0] ^ S7[zE] ^ S8[zF] ^ S6[zC];
xkey[20] = S5[z7] ^ S6[z6] ^ S7[z8] ^ S8[z9] ^ S7[z2];
xkey[22] = S5[z5] ^ S6[z4] ^ S7[zA] ^ S8[zB] ^ S8[z6];
xkey[24] = S5[x8] ^ S6[x9] ^ S7[x7] ^ S8[x6] ^ S5[x3];
xkey[26] = S5[xA] ^ S6[xB] ^ S7[x5] ^ S8[x4] ^ S6[x7];
xkey[28] = S5[xC] ^ S6[xD] ^ S7[x3] ^ S8[x2] ^ S7[x8];
xkey[30] = S5[xE] ^ S6[xF] ^ S7[x1] ^ S8[x0] ^ S8[xD];
break;
case 2: // Rotation keys, rounds 0..7
xkey[1] = (S5[z8]^S6[z9]^S7[z7]^S8[z6]^S5[z2]) & 31;
xkey[3] = (S5[zA]^S6[zB]^S7[z5]^S8[z4]^S6[z6]) & 31;
xkey[5] = (S5[zC]^S6[zD]^S7[z3]^S8[z2]^S7[z9]) & 31;
xkey[7] = (S5[zE]^S6[zF]^S7[z1]^S8[z0]^S8[zC]) & 31;
xkey[9] = (S5[x3]^S6[x2]^S7[xC]^S8[xD]^S5[x8]) & 31;
xkey[11] = (S5[x1]^S6[x0]^S7[xE]^S8[xF]^S6[xD]) & 31;
xkey[13] = (S5[x7]^S6[x6]^S7[x8]^S8[x9]^S7[x3]) & 31;
xkey[15] = (S5[x5]^S6[x4]^S7[xA]^S8[xB]^S8[x7]) & 31;
break;
case 3: // Rotation keys, rounds 8..15
xkey[17] = (S5[z3]^S6[z2]^S7[zC]^S8[zD]^S5[z9]) & 31;
xkey[19] = (S5[z1]^S6[z0]^S7[zE]^S8[zF]^S6[zC]) & 31;
xkey[21] = (S5[z7]^S6[z6]^S7[z8]^S8[z9]^S7[z2]) & 31;
xkey[23] = (S5[z5]^S6[z4]^S7[zA]^S8[zB]^S8[z6]) & 31;
xkey[25] = (S5[x8]^S6[x9]^S7[x7]^S8[x6]^S5[x3]) & 31;
xkey[27] = (S5[xA]^S6[xB]^S7[x5]^S8[x4]^S6[x7]) & 31;
xkey[29] = (S5[xC]^S6[xD]^S7[x3]^S8[x2]^S7[x8]) & 31;
xkey[31] = (S5[xE]^S6[xF]^S7[x1]^S8[x0]^S8[xD]) & 31;
break;
}
}
// Wipe the key-derived intermediate values from the locals
x0123 = x4567 = x89AB = xCDEF = 0;
z0123 = z4567 = z89AB = zCDEF = 0;
#if USE_CAST5_ASSEMBLY
// Pre-bias the rotate amounts for the assembly implementation (see the
// NOTE above the ROUND macros below)
for (PGPUInt8 i = 0; i < 32; i += 2)
{
xkey[i + 1] ^= 16;
}
#endif // USE_CAST5_ASSEMBLY
}
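// A minimal sketch of the zero-padding convention described above: callers
// with keys shorter than 128 bits pad with zeros before scheduling. The
// helper name and the use of <string.h> are illustrative assumptions, not
// part of the original API; keys of 80 bits or less would also need the
// 12-round variant, which this file does not implement.
#if 0 /* illustrative sketch, not part of the original build */
#include <string.h>
static void
ExampleScheduleShortKey(PGPUInt32 xkey[32], const PGPByte *key, size_t keyLen)
{
	PGPByte padded[16];
	memset(padded, 0, sizeof(padded));              // zero-pad up to 128 bits
	memcpy(padded, key, keyLen < 16 ? keyLen : 16);
	CAST5Schedule(xkey, padded);
	memset(padded, 0, sizeof(padded));              // wipe the key copy
}
#endif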
#if !USE_CAST5_ASSEMBLY
void
__cdecl
CAST5Encrypt(const PGPByte *in, PGPByte *out, const PGPUInt32 *xkey)
{
PGPUInt32 l, r, t;
l = (PGPUInt32) in[0]<<24 | (PGPUInt32) in[1]<<16 |
(PGPUInt32) in[2]<<8 | in[3];
r = (PGPUInt32) in[4]<<24 | (PGPUInt32) in[5]<<16 |
(PGPUInt32) in[6]<<8 | in[7];
t = F1(r, xkey, 0); l ^= G1(t);
t = F2(l, xkey, 1); r ^= G2(t);
t = F3(r, xkey, 2); l ^= G3(t);
t = F1(l, xkey, 3); r ^= G1(t);
t = F2(r, xkey, 4); l ^= G2(t);
t = F3(l, xkey, 5); r ^= G3(t);
t = F1(r, xkey, 6); l ^= G1(t);
t = F2(l, xkey, 7); r ^= G2(t);
t = F3(r, xkey, 8); l ^= G3(t);
t = F1(l, xkey, 9); r ^= G1(t);
t = F2(r, xkey, 10); l ^= G2(t);
t = F3(l, xkey, 11); r ^= G3(t);
// Stop here if only doing 12 rounds
t = F1(r, xkey, 12); l ^= G1(t);
t = F2(l, xkey, 13); r ^= G2(t);
t = F3(r, xkey, 14); l ^= G3(t);
t = F1(l, xkey, 15); r ^= G1(t);
out[0] = (PGPByte) B0(r);
out[1] = (PGPByte) B1(r);
out[2] = (PGPByte) B2(r);
out[3] = (PGPByte) B3(r);
out[4] = (PGPByte) B0(l);
out[5] = (PGPByte) B1(l);
out[6] = (PGPByte) B2(l);
out[7] = (PGPByte) B3(l);
}
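// A minimal known-answer sketch for the functions above, using the
// single-block test vector from RFC 2144 Appendix B (128-bit key, full
// 16 rounds). The function name and the memcmp usage are illustrative
// assumptions, not part of the original API.
#if 0 /* illustrative sketch, not part of the original build */
#include <string.h>
static bool
ExampleCast5KnownAnswerTest()
{
	static const PGPByte key[16] = {
		0x01, 0x23, 0x45, 0x67, 0x12, 0x34, 0x56, 0x78,
		0x23, 0x45, 0x67, 0x89, 0x34, 0x56, 0x78, 0x9A };
	static const PGPByte plain[8] = {
		0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF };
	static const PGPByte expected[8] = {
		0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 };
	PGPUInt32 xkey[32];
	PGPByte cipher[8];
	CAST5Schedule(xkey, key);
	CAST5Encrypt(plain, cipher, xkey);
	return memcmp(cipher, expected, 8) == 0;
}
#endif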
void
__cdecl
CAST5Decrypt(const PGPByte *in, PGPByte *out, const PGPUInt32 *xkey)
{
PGPUInt32 l, r, t;
r = (PGPUInt32) in[0]<<24 | (PGPUInt32) in[1]<<16 |
(PGPUInt32) in[2]<<8 | in[3];
l = (PGPUInt32) in[4]<<24 | (PGPUInt32) in[5]<<16 |
(PGPUInt32) in[6]<<8 | in[7];
t = F1(l, xkey, 15); r ^= G1(t);
t = F3(r, xkey, 14); l ^= G3(t);
t = F2(l, xkey, 13); r ^= G2(t);
t = F1(r, xkey, 12); l ^= G1(t);
// Start here if only doing 12 rounds
t = F3(l, xkey, 11); r ^= G3(t);
t = F2(r, xkey, 10); l ^= G2(t);
t = F1(l, xkey, 9); r ^= G1(t);
t = F3(r, xkey, 8); l ^= G3(t);
t = F2(l, xkey, 7); r ^= G2(t);
t = F1(r, xkey, 6); l ^= G1(t);
t = F3(l, xkey, 5); r ^= G3(t);
t = F2(r, xkey, 4); l ^= G2(t);
t = F1(l, xkey, 3); r ^= G1(t);
t = F3(r, xkey, 2); l ^= G3(t);
t = F2(l, xkey, 1); r ^= G2(t);
t = F1(r, xkey, 0); l ^= G1(t);
out[0] = (PGPByte) B0(l);
out[1] = (PGPByte) B1(l);
out[2] = (PGPByte) B2(l);
out[3] = (PGPByte) B3(l);
out[4] = (PGPByte) B0(r);
out[5] = (PGPByte) B1(r);
out[6] = (PGPByte) B2(r);
out[7] = (PGPByte) B3(r);
}
void
__cdecl
CAST5EncryptCFBdbl(
const PGPUInt32* xkey,
PGPUInt32 iv0,
PGPUInt32 iv1,
PGPUInt32 iv2,
PGPUInt32 iv3,
const PGPUInt32* src,
PGPUInt32* dest,
PGPUInt32 len)
{
PGPUInt32 iv[4] = {iv0, iv1, iv2, iv3};
while (len--)
{
CAST5Encrypt((const PGPByte*) iv, (PGPByte*) iv, xkey);
*dest++ = iv[0] ^= *src++;
*dest++ = iv[1] ^= *src++;
CAST5Encrypt((const PGPByte*) (iv+2), (PGPByte*) (iv+2), xkey);
*dest++ = iv[2] ^= *src++;
*dest++ = iv[3] ^= *src++;
}
}
void
__cdecl
CAST5DecryptCFBdbl(
const PGPUInt32* xkey,
PGPUInt32 iv0,
PGPUInt32 iv1,
PGPUInt32 iv2,
PGPUInt32 iv3,
const PGPUInt32* src,
PGPUInt32* dest,
PGPUInt32 len)
{
PGPUInt32 iv[4] = {iv0, iv1, iv2, iv3};
PGPUInt32 out[4];
while (len--)
{
CAST5Encrypt((const PGPByte*) iv, (PGPByte*) out, xkey);
*dest++ = out[0] ^ (iv[0] = *src++);
*dest++ = out[1] ^ (iv[1] = *src++);
CAST5Encrypt((const PGPByte*) (iv+2), (PGPByte*) (out+2), xkey);
*dest++ = out[2] ^ (iv[2] = *src++);
*dest++ = out[3] ^ (iv[3] = *src++);
}
}
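// A minimal round-trip sketch for the CFB pair above: encrypting and then
// decrypting with the same IV words must return the original data. Note
// that len counts four-word (16-byte) groups, not bytes. The buffer
// contents and the helper name are illustrative assumptions.
#if 0 /* illustrative sketch, not part of the original build */
#include <string.h>
static bool
ExampleCfbRoundTrip(const PGPUInt32 *xkey)
{
	PGPUInt32 plain[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };  // two 16-byte groups
	PGPUInt32 cipher[8], recovered[8];
	CAST5EncryptCFBdbl(xkey, 0, 0, 0, 0, plain, cipher, 2);
	CAST5DecryptCFBdbl(xkey, 0, 0, 0, 0, cipher, recovered, 2);
	return memcmp(recovered, plain, sizeof(plain)) == 0;
}
#endif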
#else // USE_CAST5_ASSEMBLY
/* NOTE: This expects the rotate amounts to be pre-incremented by 16
   (mod 32), which CAST5Schedule does with an XOR. */
/*
* high-speed x86 CAST implementation. Optimized for Intel P5 and P6.
* (Pentium, PPro, Pentium II).
*
* The pentium is a fairly straightforward dual-issue machine with a
* number of issue restrictions (but the only false hazard that causes
* a stall is WAW). The one significant pipeline hazard is that while
* loads are complete the cycle after they execute, they require that
* the address be available the cycle before they execute. One notable
* point is that the Pentium can execute two loads in parallel, as long
* as they hit in different banks of its 8-way-interleaved L1 cache.
*
* The PPro is an out-of-order superscalar system which can dispatch 3
* instructions per cycle (subject to significant limitations) and has
* two integer and one load/store unit. After decoding x86 instructions
* to a series of R-ops, it's a pretty straightforward out-of-order engine,
* executing instructions as their operands become available (register
* renaming ensures that false sharing doesn't block issue) except for
* one thing. The P6 is prone to an evil thing called a "partial register
* stall", when you write to a partial register and read a larger part.
* This overlap is resolved by retiring the partial write before the
* next instruction can be dispatched. This amounts to a *long* time.
*
* Unfortunately, precisely this operation is the best way to extract the
* bytes from the words in the CAST S-box lookup, so it's very important
* for an efficient CAST implementation.
* Fortunately, the processor has an exception built in. The
* instructions "xor reg,reg" and "sub reg,reg" set a magic "clear"
* flag on the register, following which the merge of the high zero
* bits causes no delay.
* The only problem is that a context switch or interrupt will save
* and restore the register, and a mov does *not* set the flag.
* Thus, you should explicitly re-clear the register occasionally,
* so that the stall won't happen.
*
* Each block encrypted seems like a good value for "occasionally".
*
* Also, loop labels should not be within 7 bytes of a 16-byte boundary.
*/
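/*
 * A minimal illustration of the re-clearing idiom described above (MSVC
 * inline assembly, illustrative only, not part of the original code): the
 * xor marks ebx as "clear", so a later partial write to bl followed by a
 * full-width read of ebx does not stall on the P6. A real function would
 * also preserve ebx, which is callee-saved.
 */
#if 0 /* illustrative sketch, not part of the original build */
static void
ExamplePartialRegisterClear()
{
	__asm
	{
		xor ebx, ebx            /* sets the "clear" flag; high 24 bits known zero */
		mov bl, al              /* partial write to the low byte */
		mov ecx, S1[ebx*4]      /* full-width read: no partial register stall */
	}
}
#endif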
/*
* Do round n from x into y, with op1/op2/op3
* This also loads in the next round's keys into
* eax and ecx.
*
* The minimum length dependency chain is 9 cycles for
* a round, but achieving that would require issuing
* three instructions per cycle - perhaps the PPro can
* manage that. Issuing two instructions per cycle,
* a round can be done in 10 cycles. Unfortunately,
* the processor does not have enough registers to use more
* parallelism than that.
*
* Dependency chain:
* op3 x,%eax/xor %edx,%edx
* rol %cl,%eax
* mov %ah,%bl/mov %al,%dl
* shr $16,%eax/XXX/AGI
* mov S1[%ebx],%ecx/mov %ah,%ebx \ Could be multi-issued
* mov S2[%edx],%edx/and $255,%eax / if we had the bandwidth
* op1 %edx,%ecx/mov S3[%ebx],%edx
* op2 %edx,%ecx/mov S4[%eax],%eax
* op3 %eax,%ecx/mov key,%eax
* xor %ecx,y/mov shift,%ecx
*
*/
/*
* This requires that %ebx be kept "clear" in the P6 sense.
*
* NOTE that the NOP *saves* a cycle on the Pentium! Without it,
* the following mov is paired with the shr, and suffers an AGI stall,
* which stalls the shr as well and thus loses two issue slots instead
* of one. We have a spare issue slot, but the shr is on the critical
* path and delaying it adds a cycle.
*/
#define ROUND(x,y,op1,op2,op3,n) \
__asm \
{ \
__asm op3 eax, x /* 1 U */ \
__asm xor edx, edx /* V */ \
__asm rol eax, cl /*2-5NP */ \
__asm mov bl, ah /* 6 U */ \
__asm mov dl, al /* V */ \
__asm shr eax, 16 /* 7 Ux */ \
__asm nop /* !!! */ /* AGI */ \
__asm mov ecx, S1[ebx*4] /* 8 U */ \
__asm mov bl, ah /* V */ \
__asm mov edx, S2[edx*4] /* 9 U */ \
__asm and eax, 255 /* V */ \
__asm op1 ecx, edx /*10 U */ \
__asm mov edx, S3[ebx*4] /* V */ \
__asm op2 ecx, edx /*11 U */ \
__asm mov edx, S4[eax*4] /* V */ \
__asm op3 ecx, edx /*12 U */ \
__asm mov eax, 8+[esi+8*n] /* V */ \
__asm xor y, ecx /*13 U */ \
__asm mov ecx, 12+[esi+8*n] /* V */ \
}
#define ROUND2(x,y,op1,op2,op3,n) \
__asm \
{ \
__asm op3 eax, x /* 1 U */ \
__asm xor edx, edx /* V */ \
__asm rol eax, cl /*2-5NP */ \
__asm mov bl, ah /* 6 U */ \
__asm mov dl, al /* V */ \
__asm shr eax, 16 /* 7 Ux */ \
__asm nop /* !!! */ /* AGI */ \
__asm mov ecx, S1[ebx*4] /* 8 U */ \
__asm mov bl, ah /* V */ \
__asm mov edx, S2[edx*4] /* 9 U */ \
__asm and eax, 255 /* V */ \
__asm op1 ecx, edx /*10 U */ \
__asm mov edx, S3[ebx*4] /* V */ \
__asm op2 ecx, edx /*11 U */ \
__asm mov edx, S4[eax*4] /* V */ \
__asm op3 ecx, edx /*12 U */ \
__asm mov eax, -8+[esi+8*n] /* V */ \
__asm xor y, ecx /*13 U */ \
__asm mov ecx, -4+[esi+8*n] /* V */ \
}
/* void
* CAST5encrypt(byte const *in, byte *out, word32 const *xkey)
* 4 8 12
*
* Register usage:
* eax is used as a temporary, and used for the primary round subkey
* ebx is all-zero in the high 24 bits, and is used for indexing
* ecx is used as a temporary, and for the rotate round subkey (PLUS 16)
* edx is all-zero in the high 24 bits, and is used for indexing
* esi points to the key schedule
* edi is the right half of the current block
* ebp is the left half of the current block
*/
#define left ebp
#define right edi
__declspec(naked)
void
__cdecl
CAST5Encrypt(const PGPByte *in, PGPByte *out, const PGPUInt32 *xkey)
{
__asm
{
ALIGN 16
push esi /* U */
push right /* V */