📄 integer.cpp
字号:
// Adds the N-word integers A and B, stores the N-word sum in C and returns the
// carry out of the most significant word (0 or 1) in eax.
// Naked function written entirely in x86 assembly through the AS1/AS2 macros.
// AddPrologue and AddEpilogue are defined elsewhere; presumably they save and
// restore the scratch registers and load the arguments as listed below.
// NOTE(review): two words are handled per iteration and the initial carry is
// only clear for even N, so N is presumably required to be even -- confirm
// against the callers.
CRYPTOPP_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
AS2( sub ecx, edx) // ecx = C - A, so [edx+ecx] addresses C while edx walks through A
AS2( xor eax, eax) // clear eax
AS2( sub eax, esi) // eax = -N, a negative index from end of B
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
AS2( sar eax, 1) // eax = -N/2, counting 8-byte word pairs; bit 0 is shifted into CF, so CF is clear for even N
AS1( jz loopendAdd) // if no word pairs then nothing to do
AS1(loopstartAdd:)
AS2( mov esi,[edx]) // load lower word of A
AS2( mov ebp,[edx+4]) // load higher word of A
AS2( mov edi,[ebx+8*eax]) // load lower word of B
AS2( lea edx,[edx+8]) // advance A and C; lea leaves the carry flag untouched
AS2( adc esi,edi) // add lower words plus the carry from the previous pair
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
AS2( adc ebp,edi) // add higher words plus the carry from the lower words
AS1( inc eax) // advance B; inc updates ZF but preserves CF
AS2( mov [edx+ecx-8],esi) // store lower word result (edx has already been advanced by 8)
AS2( mov [edx+ecx-4],ebp) // store higher word result
AS1( jnz loopstartAdd) // loop until eax reaches zero (ZF is still the one set by inc; mov does not change flags)
AS1(loopendAdd:)
AS2( adc eax, 0) // eax is zero here, so this leaves the final carry in eax (return result register)
AddEpilogue
}
// Computes the N-word difference C = A - B and returns the borrow out of the
// most significant word (0 or 1) in eax.
// Naked function written entirely in x86 assembly through the AS1/AS2 macros.
// AddPrologue and AddEpilogue are defined elsewhere; presumably they save and
// restore the scratch registers and load the arguments as listed below.
// NOTE(review): two words are handled per iteration and the initial borrow is
// only clear for even N, so N is presumably required to be even -- confirm
// against the callers.
CRYPTOPP_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
AS2( sub ecx, edx) // ecx = C - A, so [edx+ecx] addresses C while edx walks through A
AS2( xor eax, eax) // clear eax
AS2( sub eax, esi) // eax = -N, a negative index from end of B
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
AS2( sar eax, 1) // eax = -N/2, counting 8-byte word pairs; bit 0 is shifted into CF, so CF is clear for even N
AS1( jz loopendSub) // if no word pairs then nothing to do
AS1(loopstartSub:)
AS2( mov esi,[edx]) // load lower word of A
AS2( mov ebp,[edx+4]) // load higher word of A
AS2( mov edi,[ebx+8*eax]) // load lower word of B
AS2( lea edx,[edx+8]) // advance A and C; lea leaves the carry flag untouched
AS2( sbb esi,edi) // subtract lower words and the borrow from the previous pair
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
AS2( sbb ebp,edi) // subtract higher words and the borrow from the lower words
AS1( inc eax) // advance B; inc updates ZF but preserves CF
AS2( mov [edx+ecx-8],esi) // store lower word result (edx has already been advanced by 8)
AS2( mov [edx+ecx-4],ebp) // store higher word result
AS1( jnz loopstartSub) // loop until eax reaches zero (ZF is still the one set by inc; mov does not change flags)
AS1(loopendSub:)
AS2( adc eax, 0) // eax is zero here, so this leaves the final borrow (CF) in eax (return result register)
AddEpilogue
}
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
//
// Adds the N-word integers A and B, stores the N-word sum in C and returns the
// carry out of the most significant word (0 or 1) in eax. The running carry
// is kept as a 0/1 value in eax and propagated with add/jc/cmovc instead of adc.
// AddPrologue and AddEpilogue are defined elsewhere; presumably they save and
// restore the scratch registers and load the arguments as listed below.
// NOTE(review): the counter advances by 2 and the loop ends only when it hits
// zero, so N is presumably required to be even -- confirm against the callers.
CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
AS2( xor eax, eax) // eax = running carry, initially 0
AS1( neg esi) // esi = -N, counted up towards zero; ZF is set when N == 0
AS1( jz loopendAddP4) // if no dwords then nothing to do
AS2( mov edi, [edx]) // first pair: load lower word of A
AS2( mov ebp, [ebx]) // and lower word of B
AS1( jmp carry1AddP4) // there is no incoming carry yet, so skip the carry-in step
AS1(loopstartAddP4:)
AS2( mov edi, [edx+8]) // load lower word of the next pair of A
AS2( add ecx, 8) // advance C
AS2( add edx, 8) // advance A
AS2( mov ebp, [ebx]) // load lower word of B (ebx was advanced during the previous pass)
AS2( add edi, eax) // add the carry from the previous pair
AS1( jc carry1AddP4) // the carry-in itself overflowed: eax is already 1, keep it
AS2( xor eax, eax) // otherwise reset the carry
AS1(carry1AddP4:)
AS2( add edi, ebp) // add lower words
AS2( mov ebp, 1) // constant 1 for cmovc (mov does not touch the flags)
AS2( mov [ecx], edi) // store lower word result
AS2( mov edi, [edx+4]) // load higher word of A
AS2( cmovc eax, ebp) // carry = 1 if the lower-word add overflowed
AS2( mov ebp, [ebx+4]) // load higher word of B
AS2( add ebx, 8) // advance B
AS2( add edi, eax) // add the carry from the lower words
AS1( jc carry2AddP4) // the carry-in itself overflowed: eax is already 1, keep it
AS2( xor eax, eax) // otherwise reset the carry
AS1(carry2AddP4:)
AS2( add edi, ebp) // add higher words
AS2( mov ebp, 1) // constant 1 for cmovc (mov does not touch the flags)
AS2( cmovc eax, ebp) // carry = 1 if the higher-word add overflowed
AS2( mov [ecx+4], edi) // store higher word result
AS2( add esi, 2) // two more words done
AS1( jnz loopstartAddP4) // loop until the counter reaches zero
AS1(loopendAddP4:)
AddEpilogue
}
// Computes the N-word difference C = A - B and returns the borrow out of the
// most significant word (0 or 1) in eax. The running borrow is kept as a 0/1
// value in eax and propagated with sub/jc/cmovc instead of sbb.
// AddPrologue and AddEpilogue are defined elsewhere; presumably they save and
// restore the scratch registers and load the arguments as listed below.
// NOTE(review): the counter advances by 2 and the loop ends only when it hits
// zero, so N is presumably required to be even -- confirm against the callers.
CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
AS2( xor eax, eax) // eax = running borrow, initially 0
AS1( neg esi) // esi = -N, counted up towards zero; ZF is set when N == 0
AS1( jz loopendSubP4) // if no dwords then nothing to do
AS2( mov edi, [edx]) // first pair: load lower word of A
AS2( mov ebp, [ebx]) // and lower word of B
AS1( jmp carry1SubP4) // there is no incoming borrow yet, so skip the borrow-in step
AS1(loopstartSubP4:)
AS2( mov edi, [edx+8]) // load lower word of the next pair of A
AS2( add edx, 8) // advance A
AS2( add ecx, 8) // advance C
AS2( mov ebp, [ebx]) // load lower word of B (ebx was advanced during the previous pass)
AS2( sub edi, eax) // subtract the borrow from the previous pair
AS1( jc carry1SubP4) // the borrow-in itself underflowed: eax is already 1, keep it
AS2( xor eax, eax) // otherwise reset the borrow
AS1(carry1SubP4:)
AS2( sub edi, ebp) // subtract lower words
AS2( mov ebp, 1) // constant 1 for cmovc (mov does not touch the flags)
AS2( mov [ecx], edi) // store lower word result
AS2( mov edi, [edx+4]) // load higher word of A
AS2( cmovc eax, ebp) // borrow = 1 if the lower-word subtraction underflowed
AS2( mov ebp, [ebx+4]) // load higher word of B
AS2( add ebx, 8) // advance B
AS2( sub edi, eax) // subtract the borrow from the lower words
AS1( jc carry2SubP4) // the borrow-in itself underflowed: eax is already 1, keep it
AS2( xor eax, eax) // otherwise reset the borrow
AS1(carry2SubP4:)
AS2( sub edi, ebp) // subtract higher words
AS2( mov ebp, 1) // constant 1 for cmovc (mov does not touch the flags)
AS2( cmovc eax, ebp) // borrow = 1 if the higher-word subtraction underflowed
AS2( mov [ecx+4], edi) // store higher word result
AS2( add esi, 2) // two more words done
AS1( jnz loopstartSubP4) // loop until the counter reaches zero
AS1(loopendSubP4:)
AddEpilogue
}
// multiply assembly code originally contributed by Leonard Janke

// MulStartup: zeroes ebp, edi and ebx, the registers that MulAccumulate sums
// partial products into, before the first result column is computed.
#define MulStartup \
AS2(xor ebp, ebp) \
AS2(xor edi, edi) \
AS2(xor ebx, ebx)
// MulShiftCarry: moves the accumulator down by one word for the next column.
// ebp takes edx (where MulStoreDigit parked the middle word), edi takes the
// carry count collected in ebx, and ebx is cleared again.
#define MulShiftCarry \
AS2(mov ebp, edx) \
AS2(mov edi, ebx) \
AS2(xor ebx, ebx)
// MulAccumulateBottom(i,j): adds only the low 32 bits of the product
// [ecx+4*j] * [esi+4*i] to ebp, without propagating any carry. Multiply8Bottom
// uses it for its last stored word, where everything above is discarded.
#define MulAccumulateBottom(i,j) \
AS2(mov eax, [ecx+4*j]) \
AS2(imul eax, dword ptr [esi+4*i]) \
AS2(add ebp, eax)
// MulAccumulate(i,j): adds the full 64-bit product [ecx+4*j] * [esi+4*i]
// (left in edx:eax by mul) to the three-part accumulator bl:edi:ebp.
// ebx is zeroed by MulStartup/MulShiftCarry, so bh is 0 and "adc bl, bh"
// simply adds the carry out of edi to bl. Clobbers eax and edx.
#define MulAccumulate(i,j) \
AS2(mov eax, [ecx+4*j]) \
AS1(mul dword ptr [esi+4*i]) \
AS2(add ebp, eax) \
AS2(adc edi, edx) \
AS2(adc bl, bh)
// MulStoreDigit(i): parks the middle accumulator word (edi) in edx, loads the
// result pointer kept at [esp] into edi and stores the low accumulator word
// (ebp) at word index i of the result. edi no longer holds accumulator data
// afterwards; MulShiftCarry rebuilds the accumulator from edx and ebx.
#define MulStoreDigit(i) \
AS2(mov edx, edi) \
AS2(mov edi, [esp]) \
AS2(mov [edi+4*i], ebp)
// MulLastDiagonal(digits): handles the final column, which contains the single
// product [ecx+4*(digits-1)] * [esi+4*(digits-1)]. The product is added to the
// accumulator words in ebp/edi, and the two resulting words are stored at word
// indices 2*digits-2 (low, from ebp) and 2*digits-1 (high, from edx) of the
// result addressed by the pointer at [esp].
#define MulLastDiagonal(digits) \
AS2(mov eax, [ecx+4*(digits-1)]) \
AS1(mul dword ptr [esi+4*(digits-1)]) \
AS2(add ebp, eax) \
AS2(adc edx, edi) \
AS2(mov edi, [esp]) \
AS2(mov [edi+4*(2*digits-2)], ebp) \
AS2(mov [edi+4*(2*digits-1)], edx)
// Multiplies the 4-word operands X and Y and writes the 8-word product to Z.
// The product is built one result word (column) at a time: for column k every
// product X[i]*Y[j] with i + j == k is accumulated, the low accumulator word is
// stored as Z[k], and the accumulator is shifted down for the next column.
// MulPrologue and MulEpilogue are defined elsewhere; presumably they set up
// and tear down the register/stack layout described below.
CRYPTOPP_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
{
MulPrologue
// now: [esp] = Z, esi = X, ecx = Y
MulStartup
// column 0
MulAccumulate(0,0)
MulStoreDigit(0)
MulShiftCarry
// column 1
MulAccumulate(1,0)
MulAccumulate(0,1)
MulStoreDigit(1)
MulShiftCarry
// column 2
MulAccumulate(2,0)
MulAccumulate(1,1)
MulAccumulate(0,2)
MulStoreDigit(2)
MulShiftCarry
// column 3 (longest diagonal)
MulAccumulate(3,0)
MulAccumulate(2,1)
MulAccumulate(1,2)
MulAccumulate(0,3)
MulStoreDigit(3)
MulShiftCarry
// column 4
MulAccumulate(3,1)
MulAccumulate(2,2)
MulAccumulate(1,3)
MulStoreDigit(4)
MulShiftCarry
// column 5
MulAccumulate(3,2)
MulAccumulate(2,3)
MulStoreDigit(5)
MulShiftCarry
// column 6: X[3]*Y[3]; also stores the top word Z[7]
MulLastDiagonal(4)
MulEpilogue
}
// Multiplies the 8-word operands X and Y and writes the 16-word product to Z.
// The product is built one result word (column) at a time: for column k every
// product X[i]*Y[j] with i + j == k is accumulated, the low accumulator word is
// stored as Z[k], and the accumulator is shifted down for the next column.
// MulPrologue and MulEpilogue are defined elsewhere; presumably they set up
// and tear down the register/stack layout described below.
CRYPTOPP_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
{
MulPrologue
// now: [esp] = Z, esi = X, ecx = Y
MulStartup
// column 0
MulAccumulate(0,0)
MulStoreDigit(0)
MulShiftCarry
// column 1
MulAccumulate(1,0)
MulAccumulate(0,1)
MulStoreDigit(1)
MulShiftCarry
// column 2
MulAccumulate(2,0)
MulAccumulate(1,1)
MulAccumulate(0,2)
MulStoreDigit(2)
MulShiftCarry
// column 3
MulAccumulate(3,0)
MulAccumulate(2,1)
MulAccumulate(1,2)
MulAccumulate(0,3)
MulStoreDigit(3)
MulShiftCarry
// column 4
MulAccumulate(4,0)
MulAccumulate(3,1)
MulAccumulate(2,2)
MulAccumulate(1,3)
MulAccumulate(0,4)
MulStoreDigit(4)
MulShiftCarry
// column 5
MulAccumulate(5,0)
MulAccumulate(4,1)
MulAccumulate(3,2)
MulAccumulate(2,3)
MulAccumulate(1,4)
MulAccumulate(0,5)
MulStoreDigit(5)
MulShiftCarry
// column 6
MulAccumulate(6,0)
MulAccumulate(5,1)
MulAccumulate(4,2)
MulAccumulate(3,3)
MulAccumulate(2,4)
MulAccumulate(1,5)
MulAccumulate(0,6)
MulStoreDigit(6)
MulShiftCarry
// column 7 (longest diagonal)
MulAccumulate(7,0)
MulAccumulate(6,1)
MulAccumulate(5,2)
MulAccumulate(4,3)
MulAccumulate(3,4)
MulAccumulate(2,5)
MulAccumulate(1,6)
MulAccumulate(0,7)
MulStoreDigit(7)
MulShiftCarry
// column 8
MulAccumulate(7,1)
MulAccumulate(6,2)
MulAccumulate(5,3)
MulAccumulate(4,4)
MulAccumulate(3,5)
MulAccumulate(2,6)
MulAccumulate(1,7)
MulStoreDigit(8)
MulShiftCarry
// column 9
MulAccumulate(7,2)
MulAccumulate(6,3)
MulAccumulate(5,4)
MulAccumulate(4,5)
MulAccumulate(3,6)
MulAccumulate(2,7)
MulStoreDigit(9)
MulShiftCarry
// column 10
MulAccumulate(7,3)
MulAccumulate(6,4)
MulAccumulate(5,5)
MulAccumulate(4,6)
MulAccumulate(3,7)
MulStoreDigit(10)
MulShiftCarry
// column 11
MulAccumulate(7,4)
MulAccumulate(6,5)
MulAccumulate(5,6)
MulAccumulate(4,7)
MulStoreDigit(11)
MulShiftCarry
// column 12
MulAccumulate(7,5)
MulAccumulate(6,6)
MulAccumulate(5,7)
MulStoreDigit(12)
MulShiftCarry
// column 13
MulAccumulate(7,6)
MulAccumulate(6,7)
MulStoreDigit(13)
MulShiftCarry
// column 14: X[7]*Y[7]; also stores the top word Z[15]
MulLastDiagonal(8)
MulEpilogue
}
// Computes only the low 8 words of the product of the 8-word operands X and Y
// and writes them to Z[0..7].
// Columns 0..6 are accumulated with full carry tracking, exactly as in a full
// multiplication. Column 7 is the last word kept, so only the low 32 bits of
// each of its products are added (MulAccumulateBottom) and no carry is kept.
// MulPrologue and MulEpilogue are defined elsewhere; presumably they set up
// and tear down the register/stack layout described below.
CRYPTOPP_NAKED void PentiumOptimized::Multiply8Bottom(word* Z, const word* X, const word* Y)
{
MulPrologue
// now: [esp] = Z, esi = X, ecx = Y
MulStartup
// column 0
MulAccumulate(0,0)
MulStoreDigit(0)
MulShiftCarry
// column 1
MulAccumulate(1,0)
MulAccumulate(0,1)
MulStoreDigit(1)
MulShiftCarry
// column 2
MulAccumulate(2,0)
MulAccumulate(1,1)
MulAccumulate(0,2)
MulStoreDigit(2)
MulShiftCarry
// column 3
MulAccumulate(3,0)
MulAccumulate(2,1)
MulAccumulate(1,2)
MulAccumulate(0,3)
MulStoreDigit(3)
MulShiftCarry
// column 4
MulAccumulate(4,0)
MulAccumulate(3,1)
MulAccumulate(2,2)
MulAccumulate(1,3)
MulAccumulate(0,4)
MulStoreDigit(4)
MulShiftCarry
// column 5
MulAccumulate(5,0)
MulAccumulate(4,1)
MulAccumulate(3,2)
MulAccumulate(2,3)
MulAccumulate(1,4)
MulAccumulate(0,5)
MulStoreDigit(5)
MulShiftCarry
// column 6
MulAccumulate(6,0)
MulAccumulate(5,1)
MulAccumulate(4,2)
MulAccumulate(3,3)
MulAccumulate(2,4)
MulAccumulate(1,5)
MulAccumulate(0,6)
MulStoreDigit(6)
MulShiftCarry
// column 7: last word kept, low halves of the products only
MulAccumulateBottom(7,0)
MulAccumulateBottom(6,1)
MulAccumulateBottom(5,2)
MulAccumulateBottom(4,3)
MulAccumulateBottom(3,4)
MulAccumulateBottom(2,5)
MulAccumulateBottom(1,6)
MulAccumulateBottom(0,7)
MulStoreDigit(7)
MulEpilogue
}
#undef AS1
#undef AS2
#else // not x86 - no processor specific code at this layer
typedef Portable LowLevel;
#endif
#ifdef SSE2_INTRINSICS_AVAILABLE
// Calling-convention decoration applied to the static SSE2 helper P4_Mul:
// expands to nothing when compiling with GCC and to __fastcall on other compilers.
#ifdef __GNUC__
#define CRYPTOPP_FASTCALL
#else
#define CRYPTOPP_FASTCALL __fastcall
#endif
// Computes the partial-product column sums of a 4-word by 4-word multiplication
// with SSE2. A and B each hold four 32-bit words (a0..a3, b0..b3, lowest first).
// Seven 128-bit results are written to C:
//   C[0]      raw 64-bit products: a0*b0 in the lower lane, a2*b2 in the upper lane
//   C[1..5]   for column k, the sum over all ai*bj with i + j == k of the low
//             32-bit halves (lower lane) and of the high 32-bit halves (upper lane)
//   C[6]      raw 64-bit products: a1*b1 in the lower lane, a3*b3 in the upper lane
// Both operands are loaded in full before anything is stored to C.
static void CRYPTOPP_FASTCALL P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
{
    const __m128i va = _mm_load_si128(A);      // 32-bit lanes, high to low: a3 a2 a1 a0
    const __m128i vb = _mm_load_si128(B);      // 32-bit lanes, high to low: b3 b2 b1 b0
    const __m128i zero = _mm_setzero_si128();

    // Lane arrangements of the operands. _mm_mul_epu32 multiplies 32-bit lanes
    // 0 and 2 only, so each arrangement selects which two words get multiplied.
    const __m128i aOdd = _mm_srli_epi64(va, 32);                                 // lane 2 = a3, lane 0 = a1
    const __m128i bOdd = _mm_srli_epi64(vb, 32);                                 // lane 2 = b3, lane 0 = b1
    const __m128i aPerm3120 = _mm_shuffle_epi32(va, _MM_SHUFFLE(3, 1, 2, 0));    // lane 2 = a1, lane 0 = a0
    const __m128i aPerm2301 = _mm_shuffle_epi32(va, _MM_SHUFFLE(2, 3, 0, 1));    // lane 2 = a3, lane 0 = a1
    const __m128i aPerm1302 = _mm_shuffle_epi32(va, _MM_SHUFFLE(1, 3, 0, 2));    // lane 2 = a3, lane 0 = a2
    const __m128i bPerm3021 = _mm_shuffle_epi32(vb, _MM_SHUFFLE(3, 0, 2, 1));    // lane 2 = b0, lane 0 = b1
    const __m128i bPerm3012 = _mm_shuffle_epi32(vb, _MM_SHUFFLE(3, 0, 1, 2));    // lane 2 = b0, lane 0 = b2
    const __m128i bPerm2103 = _mm_shuffle_epi32(vb, _MM_SHUFFLE(2, 1, 0, 3));    // lane 2 = b1, lane 0 = b3
    const __m128i bPerm1203 = _mm_shuffle_epi32(vb, _MM_SHUFFLE(1, 2, 0, 3));    // lane 2 = b2, lane 0 = b3

    // 64-bit partial products: pIJ_KL holds aI*bJ in the upper 64-bit lane and
    // aK*bL in the lower 64-bit lane.
    const __m128i p22_00 = _mm_mul_epu32(va, vb);
    const __m128i p10_01 = _mm_mul_epu32(aPerm3120, bPerm3021);
    const __m128i p33_11 = _mm_mul_epu32(aOdd, bOdd);
    const __m128i p20_02 = _mm_mul_epu32(va, bPerm3012);
    const __m128i p30_12 = _mm_mul_epu32(aPerm2301, bPerm3012);
    const __m128i p21_03 = _mm_mul_epu32(va, bPerm2103);
    const __m128i p31_13 = _mm_mul_epu32(aPerm2301, bPerm2103);
    const __m128i p32_23 = _mm_mul_epu32(aPerm1302, bPerm1203);

    // Interleaving one product with zero splits it into its two 32-bit halves,
    // each zero-extended to 64 bits: low half in the lower lane, high half in
    // the upper lane. unpackhi picks the upper product, unpacklo the lower one.
    const __m128i s10 = _mm_unpackhi_epi32(p10_01, zero);
    const __m128i s01 = _mm_unpacklo_epi32(p10_01, zero);
    const __m128i s11 = _mm_unpacklo_epi32(p33_11, zero);
    const __m128i s20 = _mm_unpackhi_epi32(p20_02, zero);
    const __m128i s02 = _mm_unpacklo_epi32(p20_02, zero);
    const __m128i s30 = _mm_unpackhi_epi32(p30_12, zero);
    const __m128i s12 = _mm_unpacklo_epi32(p30_12, zero);
    const __m128i s21 = _mm_unpackhi_epi32(p21_03, zero);
    const __m128i s03 = _mm_unpacklo_epi32(p21_03, zero);
    const __m128i s22 = _mm_unpackhi_epi32(p22_00, zero);
    const __m128i s31 = _mm_unpackhi_epi32(p31_13, zero);
    const __m128i s13 = _mm_unpacklo_epi32(p31_13, zero);
    const __m128i s32 = _mm_unpackhi_epi32(p32_23, zero);
    const __m128i s23 = _mm_unpacklo_epi32(p32_23, zero);

    // Column sums, lowest column first.
    C[0] = p22_00;
    C[1] = _mm_add_epi64(s10, s01);
    C[2] = _mm_add_epi64(_mm_add_epi64(s11, s02), s20);
    C[3] = _mm_add_epi64(_mm_add_epi64(s21, s03), _mm_add_epi64(s30, s12));
    C[4] = _mm_add_epi64(_mm_add_epi64(s22, s31), s13);
    C[5] = _mm_add_epi64(s32, s23);
    C[6] = p33_11;
}
// Multiplies the 4-word operands A and B and writes the 8-word product to C.
// P4_Mul supplies the per-column partial sums; this routine chains them into
// the final words, carrying the upper 32 bits of each 64-bit running total
// into the next word.
// NOTE(review): P4_Mul reads A and B with _mm_load_si128, so both presumably
// have to be 16-byte aligned -- confirm against the callers.
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
{
    __m128i cols[7];
    P4_Mul(cols, reinterpret_cast<const __m128i *>(A), reinterpret_cast<const __m128i *>(B));

    // The same buffer viewed as 32-bit words and as 64-bit lanes.
    const word *w32 = reinterpret_cast<const word *>(cols);
    const __m64 *q = reinterpret_cast<const __m64 *>(cols);

    // word 0: low half of a0*b0
    C[0] = w32[0];

    // word 1: high half of a0*b0 plus the low halves of column 1
    __m64 acc = _mm_add_si64(_mm_cvtsi32_si64(w32[1]), q[2]);
    C[1] = _mm_cvtsi64_si32(acc);

    // words 2..5: carry + high halves of the previous column + low halves of this column
    acc = _mm_add_si64(_mm_srli_si64(acc, 32), _mm_add_si64(q[3], q[4]));
    C[2] = _mm_cvtsi64_si32(acc);

    acc = _mm_add_si64(_mm_srli_si64(acc, 32), _mm_add_si64(q[5], q[6]));
    C[3] = _mm_cvtsi64_si32(acc);

    acc = _mm_add_si64(_mm_srli_si64(acc, 32), _mm_add_si64(q[7], q[8]));
    C[4] = _mm_cvtsi64_si32(acc);

    acc = _mm_add_si64(_mm_srli_si64(acc, 32), _mm_add_si64(q[9], q[10]));
    C[5] = _mm_cvtsi64_si32(acc);

    // word 6: carry + high halves of column 5 + low half of a3*b3
    acc = _mm_add_si64(_mm_srli_si64(acc, 32), _mm_add_si64(q[11], _mm_cvtsi32_si64(w32[26])));
    C[6] = _mm_cvtsi64_si32(acc);

    // word 7: remaining carry + high half of a3*b3
    C[7] = _mm_cvtsi64_si32(_mm_srli_si64(acc, 32)) + w32[27];

    _mm_empty();    // clear the MMX state after using __m64 values
}
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
{
__m128i temp[28];
const word *w = (word *)temp;
const __m64 *mw = (__m64 *)w;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -