// integer.cpp
MulAcc(1, 5);
MulAcc(2, 4);
MulAcc(3, 3);
MulAcc(4, 2);
MulAcc(5, 1);
MulAcc(6, 0);
R[6] = c;
R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
}
#undef MulAcc
#undef SaveMulAcc
#undef SquAcc
#undef SaveSquAcc
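// For reference, a minimal scalar sketch of what the MulAcc(i, j) macro used
// above roughly expands to (the helper name is hypothetical; it assumes a
// 32-bit word and a 64-bit dword, as in this file's x86 builds). The running
// column sum is spread across three words (c, d, e) so that long runs of
// partial products can be accumulated without overflow.
static inline void ReferenceMulAcc(word &c, word &d, word &e, word a, word b)
{
	dword p = (dword)a * b;				// full 64-bit product
	dword t = (dword)c + (word)p;			// add the low half into c
	c = (word)t;
	t = (dword)d + (word)(p >> 32) + (word)(t >> 32);	// carry into d
	d = (word)t;
	e += (word)(t >> 32);				// any remaining carry lands in e
}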
// CodeWarrior defines _MSC_VER
#if defined(_MSC_VER) && !defined(__MWERKS__) && defined(_M_IX86) && (_M_IX86<=700)
class PentiumOptimized : public Portable
{
public:
static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
static inline void Square4(word *R, const word *A)
{
// VC60 workaround: MSVC 6.0 has an optimization bug that makes
// (dword)A*B very expensive when either A or B has previously been
// cast to a dword. Revisit this function when that bug is fixed.
Multiply4(R, A, A);
}
};
typedef PentiumOptimized LowLevel; // the Pentium routines become this build's low-level layer
__declspec(naked) word __fastcall PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
__asm
{
push ebp
push ebx
push esi
push edi
mov esi, [esp+24] // N
mov ebx, [esp+20] // B
// now: ebx = B, ecx = C, edx = A, esi = N
sub ecx, edx // hold the distance between C & A so we can add this to A to get C
xor eax, eax // clear eax
sub eax, esi // eax is a negative index from end of B
lea ebx, [ebx+4*esi] // ebx is end of B
sar eax, 1 // unit of eax is now dwords; this also clears the carry flag
jz loopend // if no dwords then nothing to do
loopstart:
mov esi,[edx] // load lower word of A
mov ebp,[edx+4] // load higher word of A
mov edi,[ebx+8*eax] // load lower word of B
lea edx,[edx+8] // advance A and C
adc esi,edi // add lower words
mov edi,[ebx+8*eax+4] // load higher word of B
adc ebp,edi // add higher words
inc eax // advance B
mov [edx+ecx-8],esi // store lower word result
mov [edx+ecx-4],ebp // store higher word result
jnz loopstart // loop until eax overflows and becomes zero
loopend:
adc eax, 0 // store carry into eax (return result register)
pop edi
pop esi
pop ebx
pop ebp
ret 8
}
}
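// The loop above adds two words per iteration, keeping the x86 carry flag
// alive across iterations (inc, lea, and mov are used because they do not
// disturb CF). A portable equivalent of the whole routine, as a sketch only
// (hypothetical helper; assumes a 32-bit word and a 64-bit dword):
static word ReferenceAdd(word *C, const word *A, const word *B, unsigned int N)
{
	dword carry = 0;
	for (unsigned int i = 0; i < N; i++)
	{
		dword sum = (dword)A[i] + B[i] + carry;	// cannot overflow 64 bits
		C[i] = (word)sum;			// low 32 bits of the column
		carry = sum >> 32;			// 0 or 1, carried forward
	}
	return (word)carry;				// final carry out
}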
__declspec(naked) word __fastcall PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
__asm
{
push ebp
push ebx
push esi
push edi
mov esi, [esp+24] // N
mov ebx, [esp+20] // B
sub ecx, edx
xor eax, eax
sub eax, esi
lea ebx, [ebx+4*esi]
sar eax, 1
jz loopend
loopstart:
mov esi,[edx]
mov ebp,[edx+4]
mov edi,[ebx+8*eax]
lea edx,[edx+8]
sbb esi,edi
mov edi,[ebx+8*eax+4]
sbb ebp,edi
inc eax
mov [edx+ecx-8],esi
mov [edx+ecx-4],ebp
jnz loopstart
loopend:
adc eax, 0
pop edi
pop esi
pop ebx
pop ebp
ret 8
}
}
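// Subtract mirrors Add with sbb in place of adc; the return value is the
// final borrow. A portable sketch under the same assumptions as the
// ReferenceAdd sketch above (hypothetical helper):
static word ReferenceSubtract(word *C, const word *A, const word *B, unsigned int N)
{
	dword borrow = 0;
	for (unsigned int i = 0; i < N; i++)
	{
		dword diff = (dword)A[i] - B[i] - borrow;	// wraps mod 2^64 on underflow
		C[i] = (word)diff;
		borrow = (word)(diff >> 32) & 1;		// 1 iff the column underflowed
	}
	return (word)borrow;
}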
#ifdef SSE2_INTRINSICS_AVAILABLE
static bool GetSSE2Capability()
{
word32 b;
__asm
{
mov eax, 1 // CPUID function 1: processor info and feature flags
cpuid
mov b, edx // feature flags are returned in edx
}
return (b & (1 << 26)) != 0; // edx bit 26 indicates SSE2 support
}
bool g_sse2DetectionDone = false, g_sse2Detected, g_sse2Enabled = true;
static inline bool HasSSE2()
{
if (g_sse2Enabled && !g_sse2DetectionDone)
{
g_sse2Detected = GetSSE2Capability();
g_sse2DetectionDone = true;
}
return g_sse2Enabled && g_sse2Detected;
}
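// Callers can then select the SSE2 path at runtime, roughly like this
// (illustrative sketch only; the actual dispatch sites live elsewhere in
// this file):
//
//	if (HasSSE2())
//		P4Optimized::Multiply4(R, A, B);
//	else
//		LowLevel::Multiply4(R, A, B);
//
// Setting g_sse2Enabled to false before the first call disables the SSE2
// code paths entirely, which is useful when debugging them.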
class P4Optimized : public PentiumOptimized
{
public:
static word __fastcall Add(word *C, const word *A, const word *B, unsigned int N);
static word __fastcall Subtract(word *C, const word *A, const word *B, unsigned int N);
static void Multiply4(word *C, const word *A, const word *B);
static void Multiply8(word *C, const word *A, const word *B);
static inline void Square4(word *R, const word *A)
{
Multiply4(R, A, A);
}
static void Multiply8Bottom(word *C, const word *A, const word *B);
};
static void __fastcall P4_Mul(__m128i *C, const __m128i *A, const __m128i *B)
{
__m128i a3210 = _mm_load_si128(A);
__m128i b3210 = _mm_load_si128(B);
__m128i sum;
__m128i z = _mm_setzero_si128();
__m128i a2b2_a0b0 = _mm_mul_epu32(a3210, b3210);
C[0] = a2b2_a0b0;
__m128i a3120 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(3, 1, 2, 0));
__m128i b3021 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 2, 1));
__m128i a1b0_a0b1 = _mm_mul_epu32(a3120, b3021);
__m128i a1b0 = _mm_unpackhi_epi32(a1b0_a0b1, z);
__m128i a0b1 = _mm_unpacklo_epi32(a1b0_a0b1, z);
C[1] = _mm_add_epi64(a1b0, a0b1);
__m128i a31 = _mm_srli_epi64(a3210, 32);
__m128i b31 = _mm_srli_epi64(b3210, 32);
__m128i a3b3_a1b1 = _mm_mul_epu32(a31, b31);
C[6] = a3b3_a1b1;
__m128i a1b1 = _mm_unpacklo_epi32(a3b3_a1b1, z);
__m128i b3012 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(3, 0, 1, 2));
__m128i a2b0_a0b2 = _mm_mul_epu32(a3210, b3012);
__m128i a0b2 = _mm_unpacklo_epi32(a2b0_a0b2, z);
__m128i a2b0 = _mm_unpackhi_epi32(a2b0_a0b2, z);
sum = _mm_add_epi64(a1b1, a0b2);
C[2] = _mm_add_epi64(sum, a2b0);
__m128i a2301 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(2, 3, 0, 1));
__m128i b2103 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(2, 1, 0, 3));
__m128i a3b0_a1b2 = _mm_mul_epu32(a2301, b3012);
__m128i a2b1_a0b3 = _mm_mul_epu32(a3210, b2103);
__m128i a3b0 = _mm_unpackhi_epi32(a3b0_a1b2, z);
__m128i a1b2 = _mm_unpacklo_epi32(a3b0_a1b2, z);
__m128i a2b1 = _mm_unpackhi_epi32(a2b1_a0b3, z);
__m128i a0b3 = _mm_unpacklo_epi32(a2b1_a0b3, z);
__m128i sum1 = _mm_add_epi64(a3b0, a1b2);
sum = _mm_add_epi64(a2b1, a0b3);
C[3] = _mm_add_epi64(sum, sum1);
__m128i a3b1_a1b3 = _mm_mul_epu32(a2301, b2103);
__m128i a2b2 = _mm_unpackhi_epi32(a2b2_a0b0, z);
__m128i a3b1 = _mm_unpackhi_epi32(a3b1_a1b3, z);
__m128i a1b3 = _mm_unpacklo_epi32(a3b1_a1b3, z);
sum = _mm_add_epi64(a2b2, a3b1);
C[4] = _mm_add_epi64(sum, a1b3);
__m128i a1302 = _mm_shuffle_epi32(a3210, _MM_SHUFFLE(1, 3, 0, 2));
__m128i b1203 = _mm_shuffle_epi32(b3210, _MM_SHUFFLE(1, 2, 0, 3));
__m128i a3b2_a2b3 = _mm_mul_epu32(a1302, b1203);
__m128i a3b2 = _mm_unpackhi_epi32(a3b2_a2b3, z);
__m128i a2b3 = _mm_unpacklo_epi32(a3b2_a2b3, z);
C[5] = _mm_add_epi64(a3b2, a2b3);
}
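// Layout of the seven slots P4_Mul writes (derived from the shuffles above;
// "lo/hi" are the 32-bit halves of each 64-bit product, zero-extended by the
// unpacks against z):
//
//	C[0] = { a0*b0, a2*b2 }                  raw products, one per 64-bit lane
//	C[1] = a1*b0 + a0*b1                     column 1, summed as {lo, hi} lanes
//	C[2] = a1*b1 + a0*b2 + a2*b0             column 2
//	C[3] = a3*b0 + a1*b2 + a2*b1 + a0*b3     column 3
//	C[4] = a2*b2 + a3*b1 + a1*b3             column 4
//	C[5] = a3*b2 + a2*b3                     column 5
//	C[6] = { a1*b1, a3*b3 }                  raw; callers use only the a3*b3 lane
//
// The callers below fold these column sums together with 64-bit adds on MMX
// registers to propagate carries into the final word array.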
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
{
__m128i temp[7]; // the seven column slots from P4_Mul
const word *w = (word *)temp; // the same storage viewed as 32-bit words
const __m64 *mw = (__m64 *)w; // and as 64-bit halves for the MMX adds
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
C[0] = w[0];
__m64 s1, s2;
__m64 w1 = _m_from_int(w[1]);
__m64 w4 = mw[2];
__m64 w6 = mw[3];
__m64 w8 = mw[4];
__m64 w10 = mw[5];
__m64 w12 = mw[6];
__m64 w14 = mw[7];
__m64 w16 = mw[8];
__m64 w18 = mw[9];
__m64 w20 = mw[10];
__m64 w22 = mw[11];
__m64 w26 = _m_from_int(w[26]);
s1 = _mm_add_si64(w1, w4);
C[1] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w6, w8);
s1 = _mm_add_si64(s1, s2);
C[2] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w10, w12);
s1 = _mm_add_si64(s1, s2);
C[3] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w14, w16);
s1 = _mm_add_si64(s1, s2);
C[4] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w18, w20);
s1 = _mm_add_si64(s1, s2);
C[5] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w22, w26);
s1 = _mm_add_si64(s1, s2);
C[6] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
C[7] = _m_to_int(s1) + w[27];
_mm_empty();
}
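// End to end, Multiply4 computes the plain 4x4 schoolbook product. A scalar
// sketch of the same computation, useful as a cross-check (hypothetical
// helper; assumes a 32-bit word and a 64-bit dword as before):
static void ReferenceMultiply4(word *C, const word *A, const word *B)
{
	dword carry = 0;
	for (unsigned int k = 0; k < 8; k++)		// result word index
	{
		dword lowSum = carry & 0xffffffff;	// carry into this column
		dword highSum = carry >> 32;		// carry into the next column
		for (unsigned int i = 0; i < 4; i++)
		{
			unsigned int j = k - i;		// unsigned wrap rejects j >= 4
			if (j < 4)
			{
				dword p = (dword)A[i] * B[j];
				lowSum += p & 0xffffffff;
				highSum += p >> 32;
			}
		}
		C[k] = (word)lowSum;
		carry = highSum + (lowSum >> 32);
	}
}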
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
{
__m128i temp[28]; // four 4x4 products, seven slots each
const word *w = (word *)temp; // w: A0*B0 (temp[0..6])
const __m64 *mw = (__m64 *)w;
const word *x = (word *)temp+7*4; // x: A1*B0 (temp[7..13])
const __m64 *mx = (__m64 *)x;
const word *y = (word *)temp+7*4*2; // y: A0*B1 (temp[14..20])
const __m64 *my = (__m64 *)y;
const word *z = (word *)temp+7*4*3; // z: A1*B1 (temp[21..27])
const __m64 *mz = (__m64 *)z;
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
C[0] = w[0];
__m64 s1, s2, s3, s4;
__m64 w1 = _m_from_int(w[1]);
__m64 w4 = mw[2];
__m64 w6 = mw[3];
__m64 w8 = mw[4];
__m64 w10 = mw[5];
__m64 w12 = mw[6];
__m64 w14 = mw[7];
__m64 w16 = mw[8];
__m64 w18 = mw[9];
__m64 w20 = mw[10];
__m64 w22 = mw[11];
__m64 w26 = _m_from_int(w[26]);
__m64 w27 = _m_from_int(w[27]);
__m64 x0 = _m_from_int(x[0]);
__m64 x1 = _m_from_int(x[1]);
__m64 x4 = mx[2];
__m64 x6 = mx[3];
__m64 x8 = mx[4];
__m64 x10 = mx[5];
__m64 x12 = mx[6];
__m64 x14 = mx[7];
__m64 x16 = mx[8];
__m64 x18 = mx[9];
__m64 x20 = mx[10];
__m64 x22 = mx[11];
__m64 x26 = _m_from_int(x[26]);
__m64 x27 = _m_from_int(x[27]);
__m64 y0 = _m_from_int(y[0]);
__m64 y1 = _m_from_int(y[1]);
__m64 y4 = my[2];
__m64 y6 = my[3];
__m64 y8 = my[4];
__m64 y10 = my[5];
__m64 y12 = my[6];
__m64 y14 = my[7];
__m64 y16 = my[8];
__m64 y18 = my[9];
__m64 y20 = my[10];
__m64 y22 = my[11];
__m64 y26 = _m_from_int(y[26]);
__m64 y27 = _m_from_int(y[27]);
__m64 z0 = _m_from_int(z[0]);
__m64 z1 = _m_from_int(z[1]);
__m64 z4 = mz[2];
__m64 z6 = mz[3];
__m64 z8 = mz[4];
__m64 z10 = mz[5];
__m64 z12 = mz[6];
__m64 z14 = mz[7];
__m64 z16 = mz[8];
__m64 z18 = mz[9];
__m64 z20 = mz[10];
__m64 z22 = mz[11];
__m64 z26 = _m_from_int(z[26]);
s1 = _mm_add_si64(w1, w4);
C[1] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w6, w8);
s1 = _mm_add_si64(s1, s2);
C[2] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w10, w12);
s1 = _mm_add_si64(s1, s2);
C[3] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x0, y0);
s2 = _mm_add_si64(w14, w16);
s1 = _mm_add_si64(s1, s3);
s1 = _mm_add_si64(s1, s2);
C[4] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x1, y1);
s4 = _mm_add_si64(x4, y4);
s1 = _mm_add_si64(s1, w18);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w20);
s1 = _mm_add_si64(s1, s3);
C[5] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x6, y6);
s4 = _mm_add_si64(x8, y8);
s1 = _mm_add_si64(s1, w22);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w26);
s1 = _mm_add_si64(s1, s3);
C[6] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x10, y10);
s4 = _mm_add_si64(x12, y12);
s1 = _mm_add_si64(s1, w27);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, s3);
C[7] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x14, y14);
s4 = _mm_add_si64(x16, y16);
s1 = _mm_add_si64(s1, z0);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, s3);
C[8] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x18, y18);
s4 = _mm_add_si64(x20, y20);
s1 = _mm_add_si64(s1, z1);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, z4);
s1 = _mm_add_si64(s1, s3);
C[9] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x22, y22);
s4 = _mm_add_si64(x26, y26);
s1 = _mm_add_si64(s1, z6);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, z8);
s1 = _mm_add_si64(s1, s3);
C[10] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x27, y27);
s1 = _mm_add_si64(s1, z10);
s1 = _mm_add_si64(s1, z12);
s1 = _mm_add_si64(s1, s3);
C[11] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(z14, z16);
s1 = _mm_add_si64(s1, s3);
C[12] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(z18, z20);
s1 = _mm_add_si64(s1, s3);
C[13] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(z22, z26);
s1 = _mm_add_si64(s1, s3);
C[14] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
C[15] = z[27] + _m_to_int(s1);
_mm_empty();
}
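// Multiply8 assembles the 8x8 product from the four 4x4 half-products
// computed by P4_Mul: writing A = A0 + A1*2^128 and B = B0 + B1*2^128,
//
//	A*B = A0*B0 + (A1*B0 + A0*B1)*2^128 + A1*B1*2^256
//
// i.e. w + (x + y)<<128 + z<<256 in the temporaries above. A scalar sketch
// of that combination, built on the hypothetical reference helpers earlier
// in this file:
static void ReferenceMultiply8(word *C, const word *A, const word *B)
{
	word x[8], y[8], xy[8];
	ReferenceMultiply4(C, A, B);		// A0*B0 -> C[0..7]
	ReferenceMultiply4(C+8, A+4, B+4);	// A1*B1 -> C[8..15]
	ReferenceMultiply4(x, A+4, B);		// cross term A1*B0
	ReferenceMultiply4(y, A, B+4);		// cross term A0*B1
	word carry = ReferenceAdd(xy, x, y, 8);	// x + y, carry out at word 8
	carry += ReferenceAdd(C+4, C+4, xy, 8);	// add the cross terms at 2^128
	for (unsigned int i = 12; carry && i < 16; i++)
	{
		dword s = (dword)C[i] + carry;	// ripple the carries into the top
		C[i] = (word)s;
		carry = (word)(s >> 32);
	}
}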
void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
{
__m128i temp[21]; // three 4x4 products; the A1*B1 quarter is not needed
const word *w = (word *)temp; // w: A0*B0 (temp[0..6])
const __m64 *mw = (__m64 *)w;
const word *x = (word *)temp+7*4; // x: A1*B0 (temp[7..13])
const __m64 *mx = (__m64 *)x;
const word *y = (word *)temp+7*4*2; // y: A0*B1 (temp[14..20])
const __m64 *my = (__m64 *)y;
P4_Mul(temp, (__m128i *)A, (__m128i *)B);
P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
C[0] = w[0];
__m64 s1, s2, s3, s4;
__m64 w1 = _m_from_int(w[1]);
__m64 w4 = mw[2];
__m64 w6 = mw[3];
__m64 w8 = mw[4];
__m64 w10 = mw[5];
__m64 w12 = mw[6];
__m64 w14 = mw[7];
__m64 w16 = mw[8];
__m64 w18 = mw[9];
__m64 w20 = mw[10];
__m64 w22 = mw[11];
__m64 w26 = _m_from_int(w[26]);
__m64 x0 = _m_from_int(x[0]);
__m64 x1 = _m_from_int(x[1]);
__m64 x4 = mx[2];
__m64 x6 = mx[3];
__m64 x8 = mx[4];
__m64 y0 = _m_from_int(y[0]);
__m64 y1 = _m_from_int(y[1]);
__m64 y4 = my[2];
__m64 y6 = my[3];
__m64 y8 = my[4];
s1 = _mm_add_si64(w1, w4);
C[1] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w6, w8);
s1 = _mm_add_si64(s1, s2);
C[2] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s2 = _mm_add_si64(w10, w12);
s1 = _mm_add_si64(s1, s2);
C[3] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x0, y0);
s2 = _mm_add_si64(w14, w16);
s1 = _mm_add_si64(s1, s3);
s1 = _mm_add_si64(s1, s2);
C[4] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x1, y1);
s4 = _mm_add_si64(x4, y4);
s1 = _mm_add_si64(s1, w18);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w20);
s1 = _mm_add_si64(s1, s3);
C[5] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
s3 = _mm_add_si64(x6, y6);
s4 = _mm_add_si64(x8, y8);
s1 = _mm_add_si64(s1, w22);
s3 = _mm_add_si64(s3, s4);
s1 = _mm_add_si64(s1, w26);
s1 = _mm_add_si64(s1, s3);
C[6] = _m_to_int(s1);
s1 = _m_psrlqi(s1, 32);
C[7] = _m_to_int(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
_mm_empty();
}
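// Multiply8Bottom produces only the low eight words of the 16-word product
// (which is all a Montgomery-style reduction needs), so the A1*B1 quarter is
// skipped and only the low words of the cross terms are folded in. A scalar
// sketch, again using the hypothetical reference helpers:
static void ReferenceMultiply8Bottom(word *C, const word *A, const word *B)
{
	word x[8], y[8];
	ReferenceMultiply4(C, A, B);	// A0*B0 fills all eight result words
	ReferenceMultiply4(x, A+4, B);	// A1*B0: only its low half lands below 2^256
	ReferenceMultiply4(y, A, B+4);	// A0*B1: likewise
	ReferenceAdd(x, x, y, 4);	// combine cross terms; carry falls above 2^256
	ReferenceAdd(C+4, C+4, x, 4);	// add at 2^128; carry out is discarded too
}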
__declspec(naked) word __fastcall P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
__asm
{
sub esp, 16
xor eax, eax
mov [esp], edi
mov [esp+4], esi
mov [esp+8], ebx
mov [esp+12], ebp
mov ebx, [esp+20] // B
mov esi, [esp+24] // N
// now: ebx = B, ecx = C, edx = A, esi = N
neg esi
jz loopend // if no dwords then nothing to do
mov edi, [edx]
mov ebp, [ebx]
loopstart:
add edi, eax
jc carry1
xor eax, eax
carry1continue:
add edi, ebp