📄 cast5.cpp
字号:
* high-speed x86 CAST implementation. Optimized for Intel P5 and P6.
* (Pentium, PPro, Pentium II).
*
* The pentium is a fairly straightforward dual-issue machine with a
* number of issue restrictions (but the only false hazard that causes
* a stall is WAW). The one significant pipeline hazard is that while
* loads are complete the cycle after they execute, they require that
* the address be available the cycle before they execute. One notable
* point is that the Pentium can execute two loads in parallel, as long
* as they hit in different banks of its 8-way-interleaved L1 cache.
*
* The PPro is an out-of-order superscalar system which can dispatch 3
* instructions per cycle (subject to significant limitations) and has
* two integer and one load/store unit. After decoding x86 instructions
* to a series of R-ops, it's pretty straightforward out-of-order engine,
* executing instructions as their operands become available (register
* renaming ensures that false sharing doesn't block issue) except for
* one thing. The P6 is prone to an evil thing called a "partial register
* stall", when you write to a partial register and read a larger part.
* this overlap is resolved by retiring the partial write before the
* next instruction can be dispatched. This amounts to a *long* time.
*
* Unfortunately, precisely this operation is the best way to extract the
* bytes from the words in the CAST S-box lookup, so it's very important
* for an efficient CAST implementation.
* Fortunately, the processor has an exception built in. The
* instructions "xor reg,reg" and "sub reg,reg" sets a magic "clear"
* flag on the register, following which the merge of the high zero
* bits causes no delay.
* The only problem is that a context switch or interrupt will save
* and restore the register, and a mov does *not* set the flag.
* Thus, you should explicitly re-clear the register occasionally,
* so that the stall won't happen.
*
* Each block encrypted seems like a good value for "occasionally".
*
* Also, loop labels whould not be within 7 bytes of a 16-byte boundary.
*/
/*
* Do round n from x into y, with op1/op2/op3
* This also loads in the next round's keys into
* eax and ecx.
*
* The minimum length dependency chain is 9 cycles for
* a round, but achieving that would require issuing
* three instructions per cycle - perhaps the PPro can
* manage that. Issuing two instructions per cycle,
* a round can be done in 10 cycles. Unfortunately,
* the processor does not have enough register to use more
* parallelism than that.
*
* Dependency chain:
* op3 x,%eax/xor %edx,%edx
* rol %cl,%eax
* mov %ah,%bl/mov %al,%dl
* shr $16,%eax/XXX/AGI
* mov S1[%ebx],%ecx/mov %ah,%ebx \ Could be mult-issued
* mov S2[%edx],%edx/and $255,%eax / if we had the bandwidth
* op1 %edx,%ecx/mov S3[%ebx],%edx
* op2 %edx,%ecx/mov S4[%eax],%eax
* op3 %eax,%ecx/mov key,%eax
* xor %ecx,y/mov shift,%ecx
*
*/
/*
* This requires that %ebx be kept "clear" in the P6 sense.
*
* NOTE that the NOP *saves* a cycle on the Pentium! Without it,
* the following mov is paired with the shr, and suffers an AGI stall,
* which stalls the shr as well and thus loses two issue slots instead
* of one. We have a spare issue slot, but the shr is on the critical
* path and delaying it adds a cycle.
*/
#define ROUND(x,y,op1,op2,op3,n) \
__asm \
{ \
__asm op3 eax, x /* 1 U */ \
__asm xor edx, edx /* V */ \
__asm rol eax, cl /*2-5NP */ \
__asm mov bl, ah /* 6 U */ \
__asm mov dl, al /* V */ \
__asm shr eax, 16 /* 7 Ux */ \
__asm nop /* !!! */ /* AGI */ \
__asm mov ecx, S1[ebx*4] /* 8 U */ \
__asm mov bl, ah /* V */ \
__asm mov edx, S2[edx*4] /* 9 U */ \
__asm and eax, 255 /* V */ \
__asm op1 ecx, edx /*10 U */ \
__asm mov edx, S3[ebx*4] /* V */ \
__asm op2 ecx, edx /*11 U */ \
__asm mov edx, S4[eax*4] /* V */ \
__asm op3 ecx, edx /*12 U */ \
__asm mov eax, 8+[esi+8*n] /* V */ \
__asm xor y, ecx /*13 U */ \
__asm mov ecx, 12+[esi+8*n] /* V */ \
}
/* void
* CAST5encrypt(byte const *in, byte *out, PGPUInt32 const *xkey)
* 4 8 12
*
* Register usage:
* esi points to key schedule
* eax is used as a temporary, and used for the primary round subkey
* ebx is all-zero in the high 24 bits, and is used for indexing
* ecx is used as a temporary, and for the rotate round subkey (PLUS 16)
* edx is all-zero in the high 24 bits, and is used for indexing
* esi points to the key schedule
* edi is the right half of the current block
* ebp is the left half of the current block
*/
#define left ebp
#define right edi
NAKED
void
__cdecl
CAST5encryptAsm(const PGPUInt8 *in, PGPUInt8 *out, const PGPUInt32 *xkey)
{
__asm
{
ALIGN 16
push esi /* U */
push right /* V */
mov right, 8+[esp+4] /* U */
mov esi, 8+[esp+12] /* V */
push left /* U */
push ebx /* V */
mov left, [right] /* U */
mov right, [right+4] /* V */
bswap right /* NP */
bswap left /* NP */
mov eax, [esi] /* U */
xor ebx,ebx /* V */
mov ecx, [esi+4] /* U */
ROUND(right, left, xor, sub, add, 0)
ROUND(left, right, sub, add, xor, 1)
ROUND(right, left, add, xor, sub, 2)
ROUND(left, right, xor, sub, add, 3)
ROUND(right, left, sub, add, xor, 4)
ROUND(left, right, add, xor, sub, 5)
ROUND(right, left, xor, sub, add, 6)
ROUND(left, right, sub, add, xor, 7)
ROUND(right, left, add, xor, sub, 8)
ROUND(left, right, xor, sub, add, 9)
ROUND(right, left, sub, add, xor,10)
ROUND(left, right, add, xor, sub,11)
ROUND(right, left, xor, sub, add,12)
ROUND(left, right, sub, add, xor,13)
ROUND(right, left, add, xor, sub,14)
/* ROUND(left, right, xor, sub, add,15)
* Last round: omit loading of keys for next round
* Fetch out pointer and store data there instead
*/
add eax, left /* 1 U */
xor edx, edx /* V */
rol eax, cl /*2-5NP */
mov bl, ah /* 6 U */
mov dl, al /* V */
shr eax, 16 /* 7 Ux */
mov esi, 16+[esp+8] /* V */
mov ecx, S1[ebx*4] /* 8 U */
mov bl, ah /* V */
mov edx, S2[edx*4] /* 9 U */
and eax, 255 /* V */
xor ecx, edx /*10 U */
mov edx, S3[ebx*4] /* V */
sub ecx, edx /*11 U */
mov eax, S4[eax*4] /* V */
add ecx, eax /*12 U */
pop ebx /* V */
bswap left /*13 NP */
xor right, ecx /*14 U */
mov [esi+4], left /* V */
bswap right /* NP */
pop left /* U */
mov [esi], right /* V */
pop right /* U */
pop esi /* V */
ret /* NP */
}
}
/*
* asm void
* CAST5encryptCFBdbl(
* register PGPUInt32 const *xkey, // esp+pushes+ 4
* register PGPUInt32 in0, // esp+pushes+ 8
* register PGPUInt32 in1, // esp+pushes+12
* register PGPUInt32 in2, // esp+pushes+16
* register PGPUInt32 in3, // esp+pushes+20
* register PGPUInt32 const *src, // esp+pushes+24
* register PGPUInt32 *dest, // esp+pushes+28
* register PGPUInt32 len) // esp+pushes+32
*
* Note that "len" is the number of 16-byte units to encrypt.
* Since this function only encrypts one block per time
* around the loop, it has to be doubled.
*
* Doing the dbl part...
* We use the srgument slots on the stack as IV registers, but
* actually only use one IV, in the in2/in3 slots.
* Each iteration, we fetch from them the data to be encrypted
* for the next iteration before storing the current
* ciphertext for the ineration after the next, thus achieving the
* necessary interleaving.
*/
#define left ebp
#define right edi
NAKED
void
__cdecl
CAST5encryptCFBdblAsm(
const PGPUInt32 *xkey,
PGPUInt32 iv0,
PGPUInt32 iv1,
PGPUInt32 iv2,
PGPUInt32 iv3,
const PGPUInt32 *src,
PGPUInt32 *dest,
PGPUInt32 len)
{
__asm
{
ALIGN 16
push esi
push right
mov esi, 8+[esp+4] /* U - load key schedule pointer */
push left /* V */
mov left, 12+[esp+8] /* U - load in0 as left */
mov right, 12+[esp+12] /* V - load in1 as right */
push ebx /* U */
xor ebx,ebx /* V */
shl dword ptr 16+[esp+32], 1 /* NP - double loop counter */
encryptloop:
mov eax, [esi] /* U - preload key material */
mov ecx, [esi+4] /* V - preload key material */
bswap right /* NP */
bswap left /* NP */
ROUND(right, left, xor, sub, add, 0)
ROUND(left, right, sub, add, xor, 1)
ROUND(right, left, add, xor, sub, 2)
ROUND(left, right, xor, sub, add, 3)
ROUND(right, left, sub, add, xor, 4)
ROUND(left, right, add, xor, sub, 5)
ROUND(right, left, xor, sub, add, 6)
ROUND(left, right, sub, add, xor, 7)
ROUND(right, left, add, xor, sub, 8)
ROUND(left, right, xor, sub, add, 9)
ROUND(right, left, sub, add, xor,10)
ROUND(left, right, add, xor, sub,11)
ROUND(right, left, xor, sub, add,12)
ROUND(left, right, sub, add, xor,13)
ROUND(right, left, add, xor, sub,14)
/* ROUND(left, right, xor, sub, add,15)
* Last round: omit loading of keys for next round
* Instead, start the CFB operations. Including the
* swap of the halves, that ends up as:
*
* %eax = src[0] ^ bswap(right)
* %ecx = src[1] ^ bswap(left)
* src += 8bytes
* left = bswap(in2)
* right = bswap(in3)
* in2 = %eax
* dest[0] = %eax
* in3 = %ecx
* dest[1] = %eax
* dest += 8bytes
*
* For the '486, we can just use bswap. For the '386, it's
* xchg ah,al
* rol $16,eax
* xchg ah,al
*
* Annoyingly, this really *sucks* on a PPro, due to fierce
* partial-register stalls. So it pretty much has to go two ways.
* Options are:
* - Duplicate the entire encryption loop?
* - Do it in a pre-pass and a post-pass. Makes life easy, but we
* end up being memory bound.
* - Something truly wierd?
*/
add eax, left /* 1 U */
xor edx, edx /* V */
rol eax, cl /*2-5NP */
mov bl, ah /* 6 U */
mov dl, al /* V */
shr eax, 16 /* 7 Ux */
nop /* V */
mov ecx, S1[ebx*4] /* 8 U */
mov bl, ah /* V */
mov edx, S2[edx*4] /* 9 U */
and eax, 255 /* V */
xor ecx, edx /*10 U */
mov edx, S3[ebx*4] /* V */
sub ecx, edx /*11 U */
mov edx, S4[eax*4] /* V */
add ecx, edx /*12 U */
mov ebx, 16+[esp+24] /* V - fetch src pointer */
xor right, ecx /*13 U */
add ebx, 8 /* V - increment src ptr */
bswap left
bswap right
mov eax, [ebx-8] /* U - get src word */
mov ecx, [ebx-4] /* V - other src word */
mov 16+[esp+24], ebx /* U - store src pointer back */
mov edx, 16+[esp+28] /* V - fetch dest pointer */
xor eax, right /* U */
xor ecx, left /* V */
mov left, 16+[esp+16] /* U - fetch in2 for new left */
mov right, 16+[esp+20] /* V - fetch in3 for new right */
mov 16+[esp+16], eax /* U - store ciphertext for next time */
mov 16+[esp+20], ecx /* V - store ciphertext for next time */
mov [edx+0], eax /* U - store result */
mov [edx+4], ecx /* V - store result */
add edx, 8 /* U - increment dest ptr */
xor ebx, ebx /* V - clear %ebx for next iteration */
dec dword ptr 16+[esp+32] /* U - decrement loop counter (set ZF) */
mov 16+[esp+28], edx /* V - store dest pointer back */
/* Pairing opportunity lost, sigh */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -