📄 integer.cpp
字号:
R[6] = c;
R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
}
#undef MulAcc
#undef SaveMulAcc
#undef SquAcc
#undef SaveSquAcc
// optimized
#ifdef TAOCRYPT_X86ASM_AVAILABLE
// ************** x86 feature detection ***************
// Runtime switch for the SSE2 code paths: HasSSE2() returns false as soon as
// this is false, and DisableSSE2() is what clears it.
static bool s_sse2Enabled = true;
// Executes the CPUID instruction with `input` in EAX and writes the resulting
// EAX, EBX, ECX and EDX values to output[0], output[1], output[2] and
// output[3] respectively. `output` must point to at least four word32s.
static void CpuId(word32 input, word32 *output)
{
#ifdef __GNUC__
__asm__
(
// save ebx in case -fPIC is being used
// (EBX is therefore never named as an operand: its value is copied out
// through EDI before the saved EBX is popped back)
"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d"(output[3])
: "a" (input)
);
#else
// Non-GCC inline assembly: run CPUID, then store the four result registers
// through the output pointer (EDI is used as the scratch pointer register).
__asm
{
mov eax, input
cpuid
mov edi, output
mov [edi], eax
mov [edi+4], ebx
mov [edi+8], ecx
mov [edi+12], edx
}
#endif
}
#ifdef SSE2_INTRINSICS_AVAILABLE
#ifndef _MSC_VER
// Jump buffer filled by the setjmp() call in HasSSE2(); it is the landing
// point used when the probe instruction raises SIGILL.
static jmp_buf s_env;
// SIGILL handler that HasSSE2() installs around its probe instruction. It
// jumps back to the setjmp(s_env) call, which then returns 1.
static void SigIllHandler(int)
{
longjmp(s_env, 1);
}
#endif
// Reports whether the SSE2 code paths may be used. All of the following must
// hold: the s_sse2Enabled switch is on, CPUID leaf 1 sets bit 26 of EDX, and a
// probe instruction operating on xmm0 executes without raising an
// exception / SIGILL.
static bool HasSSE2()
{
if (!s_sse2Enabled)
return false;
word32 cpuid[4];
CpuId(1, cpuid);
// cpuid[3] holds EDX; bit 26 is the SSE2 feature flag.
if ((cpuid[3] & (1 << 26)) == 0)
return false;
#ifdef _MSC_VER
// Run the probe under structured exception handling: the filter value 1
// means every exception is handled here and reported as "not available".
__try
{
__asm xorpd xmm0, xmm0 // executing SSE2 instruction
}
__except (1)
{
return false;
}
return true;
#else
// Temporarily install a SIGILL handler that longjmps back to the setjmp
// below, so a faulting probe yields `false` instead of killing the process.
typedef void (*SigHandler)(int);
SigHandler oldHandler = signal(SIGILL, SigIllHandler);
if (oldHandler == SIG_ERR)
return false;
bool result = true;
// setjmp returns 0 on the direct call (run the probe) and non-zero when
// SigIllHandler jumps back here (the probe faulted).
// NOTE(review): xorps is an SSE instruction, whereas the _MSC_VER branch
// probes with the SSE2 instruction xorpd -- confirm the difference is
// intended.
if (setjmp(s_env))
result = false;
else
__asm __volatile ("xorps %xmm0, %xmm0");
// Put the previous SIGILL disposition back before returning.
signal(SIGILL, oldHandler);
return result;
#endif
}
#endif
static bool IsP4()
{
word32 cpuid[4];
CpuId(0, cpuid);
mySTL::swap(cpuid[2], cpuid[3]);
if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
return false;
CpuId(1, cpuid);
return ((cpuid[0] >> 8) & 0xf) == 0xf;
}
// ************** Pentium/P4 optimizations ***************
// Declares the assembly routines used when the CPU is not detected as a P4
// (see SetPentiumFunctionPointers). Everything not declared here is inherited
// from Portable.
class PentiumOptimized : public Portable
{
public:
// Adds the N-word arrays A and B into C; the carry out of the top word is
// the return value.
static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
unsigned int N);
// Subtracts the N-word array B from A into C; the borrow out of the top word
// is the return value.
static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
unsigned int N);
// Fixed-size multiplication routines; judging by the names they handle 4-word
// and 8-word operands -- their definitions are outside this view, confirm
// there.
static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
const word *B);
static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
const word *B);
static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
const word *B);
};
// Declares the routines selected when IsP4() is true (Add/Subtract) or when
// HasSSE2() is true (the Multiply* family); see SetPentiumFunctionPointers.
// Unlike PentiumOptimized this class has no base class.
class P4Optimized
{
public:
// Same contract as PentiumOptimized::Add, implemented without adc.
static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
unsigned int N);
// Same contract as PentiumOptimized::Subtract, implemented without sbb.
static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
unsigned int N);
#ifdef SSE2_INTRINSICS_AVAILABLE
// Multiplication routines that only exist when SSE2 intrinsics are available
// at compile time; their definitions are outside this view.
static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
const word *B);
static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
const word *B);
static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
const word *B);
#endif
};
// Pointer type shared by the Add and Subtract implementations.
typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,
unsigned int N);
// Pointer type shared by the Multiply4 / Multiply8 / Multiply8Bottom
// implementations.
typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);
// Currently selected implementations; assigned by
// SetPentiumFunctionPointers() and called through the LowLevel wrappers.
static PAddSub s_pAdd, s_pSub;
#ifdef SSE2_INTRINSICS_AVAILABLE
// Multiplication is only dispatched at run time when an SSE2 variant exists.
static PMul s_pMul4, s_pMul8, s_pMul8B;
#endif
// Selects the implementation behind each dispatch pointer:
//   - s_pAdd / s_pSub use P4Optimized when IsP4() is true, PentiumOptimized
//     otherwise;
//   - when SSE2 intrinsics are compiled in, s_pMul4 / s_pMul8 / s_pMul8B use
//     P4Optimized when HasSSE2() is true, PentiumOptimized otherwise.
static void SetPentiumFunctionPointers()
{
    const bool onP4 = IsP4();
    s_pAdd = onP4 ? &P4Optimized::Add : &PentiumOptimized::Add;
    s_pSub = onP4 ? &P4Optimized::Subtract : &PentiumOptimized::Subtract;

#ifdef SSE2_INTRINSICS_AVAILABLE
    const bool sse2Usable = HasSSE2();
    s_pMul4 = sse2Usable ? &P4Optimized::Multiply4
                         : &PentiumOptimized::Multiply4;
    s_pMul8 = sse2Usable ? &P4Optimized::Multiply8
                         : &PentiumOptimized::Multiply8;
    s_pMul8B = sse2Usable ? &P4Optimized::Multiply8Bottom
                          : &PentiumOptimized::Multiply8Bottom;
#endif
}
// Dummy variable whose only purpose is its initializer: the comma expression
// calls SetPentiumFunctionPointers() and then yields 0, so the dispatch
// pointers get assigned when this variable is initialized.
// NOTE(review): code in other translation units that runs during static
// initialization before this variable is initialized would presumably see
// unset pointers -- confirm no such caller exists.
static const char s_RunAtStartupSetPentiumFunctionPointers =
(SetPentiumFunctionPointers(), 0);
// Turns the SSE2 code paths off: clears the s_sse2Enabled switch (which makes
// HasSSE2() return false) and re-runs the selection so the dispatch pointers
// are assigned again under the new setting.
void DisableSSE2()
{
s_sse2Enabled = false;
SetPentiumFunctionPointers();
}
// Dispatch layer: each wrapper forwards to the implementation currently
// stored in the matching s_p* pointer. Members not overridden here come from
// PentiumOptimized.
class LowLevel : public PentiumOptimized
{
public:
    // Forwards to the addition routine selected by
    // SetPentiumFunctionPointers() and returns its result.
    static inline word Add(word *C, const word *A, const word *B,
                           unsigned int N)
    {
        return s_pAdd(C, A, B, N);
    }

    // Forwards to the selected subtraction routine and returns its result.
    static inline word Subtract(word *C, const word *A, const word *B,
                                unsigned int N)
    {
        return s_pSub(C, A, B, N);
    }

    // Squares A by multiplying it with itself through Multiply4.
    static inline void Square4(word *R, const word *A)
    {
        Multiply4(R, A, A);
    }

#ifdef SSE2_INTRINSICS_AVAILABLE
    // With SSE2 intrinsics compiled in, the multiplications are dispatched at
    // run time as well; otherwise the inherited PentiumOptimized versions are
    // used directly.
    static inline void Multiply4(word *C, const word *A, const word *B)
    {
        s_pMul4(C, A, B);
    }

    static inline void Multiply8(word *C, const word *A, const word *B)
    {
        s_pMul8(C, A, B);
    }

    static inline void Multiply8Bottom(word *C, const word *A, const word *B)
    {
        s_pMul8B(C, A, B);
    }
#endif
};
// use some tricks to share assembly code between MSVC and GCC
// The function bodies below are written once as AS1()/AS2() lines wrapped in
// a Prologue/Epilogue pair; the macros in this section expand those lines
// either to MSVC __asm statements or to one GCC extended-asm statement.
#ifdef _MSC_VER
// MSVC: the functions are naked, so the prologue/epilogue macros do all
// register saving, argument loading and the final ret themselves.
#define TAOCRYPT_NAKED __declspec(naked)
// One-operand and two-operand instruction lines.
#define AS1(x) __asm x
#define AS2(x, y) __asm x, y
// Saves ebp/ebx/esi/edi, then loads the four stack arguments. After the four
// pushes the first argument is at [esp+20], so this yields
// ecx = C, edx = A, ebx = B, esi = N.
#define AddPrologue \
__asm push ebp \
__asm push ebx \
__asm push esi \
__asm push edi \
__asm mov ecx, [esp+20] \
__asm mov edx, [esp+24] \
__asm mov ebx, [esp+28] \
__asm mov esi, [esp+32]
// Restores the registers saved by AddPrologue and returns; eax is left
// untouched and so carries the function result.
#define AddEpilogue \
__asm pop edi \
__asm pop esi \
__asm pop ebx \
__asm pop ebp \
__asm ret
// Saves ebp/ebx/esi/edi, loads the third argument into ecx and the second
// into esi, then pushes a copy of the first argument so it can be read back
// from [esp].
#define MulPrologue \
__asm push ebp \
__asm push ebx \
__asm push esi \
__asm push edi \
__asm mov ecx, [esp+28] \
__asm mov esi, [esp+24] \
__asm push [esp+20]
// Drops the pushed copy of the first argument, restores the saved registers
// and returns.
#define MulEpilogue \
__asm add esp, 4 \
__asm pop edi \
__asm pop esi \
__asm pop ebx \
__asm pop ebp \
__asm ret
#else
// GCC: the functions are ordinary functions; each body becomes a single
// __asm__ statement that the Prologue macro opens and the Epilogue macro
// closes, so everything in between must expand to string literals.
#define TAOCRYPT_NAKED
// Stringify an instruction line into a fragment of the asm template.
#define AS1(x) #x ";"
#define AS2(x, y) #x ", " #y ";"
// Opens the asm statement, saves ebx and ebp by hand, loads B (operand %2)
// into ebx and switches the assembler to Intel syntax for the body.
// NOTE(review): operand %2 is a memory operand read after "push %%ebx"; if
// the compiler addresses it relative to esp the offset would be off by the
// pushed word -- confirm the build keeps B addressable some other way.
#define AddPrologue \
__asm__ __volatile__ \
( \
"push %%ebx;" /* save this manually, in case of -fPIC */ \
"mov %2, %%ebx;" \
".intel_syntax noprefix;" \
"push ebp;"
// Restores ebp and ebx, switches back to AT&T syntax and closes the asm
// statement with its operand list: C in ecx, A in edx, B in memory, N in esi.
// NOTE(review): there is no output operand and the enclosing functions have
// no return statement, so the result presumably reaches the caller only by
// being left in eax -- confirm.
#define AddEpilogue \
"pop ebp;" \
".att_syntax prefix;" \
"pop %%ebx;" \
: \
: "c" (C), "d" (A), "m" (B), "S" (N) \
: "%edi", "memory", "cc" \
);
// Opens the asm statement, saves ebx and ebp, and pushes Z (operand %0) so
// the body can read the destination pointer from [esp].
// NOTE(review): operand %0 may be a memory operand and is read after two
// pushes; the same esp-relative addressing concern as in AddPrologue applies.
#define MulPrologue \
__asm__ __volatile__ \
( \
"push %%ebx;" /* save this manually, in case of -fPIC */ \
"push %%ebp;" \
"push %0;" \
".intel_syntax noprefix;"
// Drops the pushed Z, restores ebp and ebx and closes the asm statement with
// its operand list: Z in a register or memory, X in esi, Y in ecx.
#define MulEpilogue \
"add esp, 4;" \
"pop ebp;" \
"pop ebx;" \
".att_syntax prefix;" \
: \
: "rm" (Z), "S" (X), "c" (Y) \
: "%eax", "%edx", "%edi", "memory", "cc" \
);
#endif
// Adds the N-word arrays A and B into C using adc and leaves the final carry
// (0 or 1) in eax as the return value. The loop handles two words per
// iteration, so N is assumed to be even -- TODO confirm with callers.
TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,
const word *B, unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
AS2( sub ecx, edx) // hold the distance between C & A so we
// can add this to A to get C
AS2( xor eax, eax) // clear eax
AS2( sub eax, esi) // eax is a negative index from end of B
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
AS2( sar eax, 1) // unit of eax is now dwords; this also
// clears the carry flag
AS1( jz loopendAdd) // if no dwords then nothing to do
// Only adc changes the carry flag inside the loop (mov and lea leave all
// flags alone, inc leaves the carry flag alone), so the carry chains from
// one word to the next and across iterations.
AS1(loopstartAdd:)
AS2( mov esi,[edx]) // load lower word of A
AS2( mov ebp,[edx+4]) // load higher word of A
AS2( mov edi,[ebx+8*eax]) // load lower word of B
AS2( lea edx,[edx+8]) // advance A and C
AS2( adc esi,edi) // add lower words
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
AS2( adc ebp,edi) // add higher words
AS1( inc eax) // advance B
AS2( mov [edx+ecx-8],esi) // store lower word result
AS2( mov [edx+ecx-4],ebp) // store higher word result
AS1( jnz loopstartAdd) // loop until eax overflows and becomes zero
AS1(loopendAdd:)
AS2( adc eax, 0) // store carry into eax (return result register)
AddEpilogue
}
// Subtracts the N-word array B from A into C using sbb and leaves the final
// borrow (0 or 1) in eax as the return value. The loop handles two words per
// iteration, so N is assumed to be even -- TODO confirm with callers.
TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,
const word *B, unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
AS2( sub ecx, edx) // hold the distance between C & A so we
// can add this to A to get C
AS2( xor eax, eax) // clear eax
AS2( sub eax, esi) // eax is a negative index from end of B
AS2( lea ebx, [ebx+4*esi]) // ebx is end of B
AS2( sar eax, 1) // unit of eax is now dwords; this also
// clears the carry flag
AS1( jz loopendSub) // if no dwords then nothing to do
// Only sbb changes the carry flag inside the loop (mov and lea leave all
// flags alone, inc leaves the carry flag alone), so the borrow chains from
// one word to the next and across iterations.
AS1(loopstartSub:)
AS2( mov esi,[edx]) // load lower word of A
AS2( mov ebp,[edx+4]) // load higher word of A
AS2( mov edi,[ebx+8*eax]) // load lower word of B
AS2( lea edx,[edx+8]) // advance A and C
AS2( sbb esi,edi) // subtract lower words
AS2( mov edi,[ebx+8*eax+4]) // load higher word of B
AS2( sbb ebp,edi) // subtract higher words
AS1( inc eax) // advance B
AS2( mov [edx+ecx-8],esi) // store lower word result
AS2( mov [edx+ecx-4],ebp) // store higher word result
AS1( jnz loopstartSub) // loop until eax overflows and becomes zero
AS1(loopendSub:)
AS2( adc eax, 0) // eax is zero here, so this stores the borrow (carry flag) into eax (return result register)
AddEpilogue
}
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
// Adds the N-word arrays A and B into C and leaves the final carry (0 or 1)
// in eax as the return value. The carry is tracked explicitly in eax with
// add / jc / cmovc instead of adc. Two words are handled per iteration, so N
// is assumed to be even -- TODO confirm with callers.
TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,
unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
// eax holds the running carry (0 or 1) and ends up as the return value
AS2( xor eax, eax)
// esi becomes -N and is stepped up by 2 per iteration until it reaches zero
AS1( neg esi)
AS1( jz loopendAddP4) // if no dwords then nothing to do
// first iteration: preload A[0] and B[0] and enter the loop body past the
// pointer advance and carry-in add
AS2( mov edi, [edx])
AS2( mov ebp, [ebx])
AS1( jmp carry1AddP4)
AS1(loopstartAddP4:)
// load the next lower words of A and B, advancing C and A (B was already
// advanced in the previous pass)
AS2( mov edi, [edx+8])
AS2( add ecx, 8)
AS2( add edx, 8)
AS2( mov ebp, [ebx])
// add the incoming carry; if that alone overflows, eax is already 1 and
// must stay 1, otherwise reset it before adding the B word
AS2( add edi, eax)
AS1( jc carry1AddP4)
AS2( xor eax, eax)
AS1(carry1AddP4:)
AS2( add edi, ebp)
// the mov instructions below do not alter flags, so cmovc still sees the
// carry of the add above and sets eax to 1 when it carried
AS2( mov ebp, 1)
AS2( mov [ecx], edi)
AS2( mov edi, [edx+4])
AS2( cmovc eax, ebp)
AS2( mov ebp, [ebx+4])
AS2( add ebx, 8)
// same carry handling for the higher word of the pair
AS2( add edi, eax)
AS1( jc carry2AddP4)
AS2( xor eax, eax)
AS1(carry2AddP4:)
AS2( add edi, ebp)
AS2( mov ebp, 1)
AS2( cmovc eax, ebp)
AS2( mov [ecx+4], edi)
// two words done; loop until the negative counter reaches zero
AS2( add esi, 2)
AS1( jnz loopstartAddP4)
AS1(loopendAddP4:)
AddEpilogue
}
// Subtracts the N-word array B from A into C and leaves the final borrow
// (0 or 1) in eax as the return value. The borrow is tracked explicitly in
// eax with sub / jc / cmovc instead of sbb. Two words are handled per
// iteration, so N is assumed to be even -- TODO confirm with callers.
TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
const word *B, unsigned int N)
{
AddPrologue
// now: ebx = B, ecx = C, edx = A, esi = N
// eax holds the running borrow (0 or 1) and ends up as the return value
AS2( xor eax, eax)
// esi becomes -N and is stepped up by 2 per iteration until it reaches zero
AS1( neg esi)
AS1( jz loopendSubP4) // if no dwords then nothing to do
// first iteration: preload A[0] and B[0] and enter the loop body past the
// pointer advance and borrow-in subtract
AS2( mov edi, [edx])
AS2( mov ebp, [ebx])
AS1( jmp carry1SubP4)
AS1(loopstartSubP4:)
// load the next lower words of A and B, advancing A and C (B was already
// advanced in the previous pass)
AS2( mov edi, [edx+8])
AS2( add edx, 8)
AS2( add ecx, 8)
AS2( mov ebp, [ebx])
// subtract the incoming borrow; if that alone underflows, eax is already 1
// and must stay 1, otherwise reset it before subtracting the B word
AS2( sub edi, eax)
AS1( jc carry1SubP4)
AS2( xor eax, eax)
AS1(carry1SubP4:)
AS2( sub edi, ebp)
// the mov instructions below do not alter flags, so cmovc still sees the
// borrow of the sub above and sets eax to 1 when it borrowed
AS2( mov ebp, 1)
AS2( mov [ecx], edi)
AS2( mov edi, [edx+4])
AS2( cmovc eax, ebp)
AS2( mov ebp, [ebx+4])
AS2( add ebx, 8)
// same borrow handling for the higher word of the pair
AS2( sub edi, eax)
AS1( jc carry2SubP4)
AS2( xor eax, eax)
AS1(carry2SubP4:)
AS2( sub edi, ebp)
AS2( mov ebp, 1)
AS2( cmovc eax, ebp)
AS2( mov [ecx+4], edi)
// two words done; loop until the negative counter reaches zero
AS2( add esi, 2)
AS1( jnz loopstartSubP4)
AS1(loopendSubP4:)
AddEpilogue
}
// multiply assembly code originally contributed by Leonard Janke
// Register roles established by MulPrologue and used by the macros below:
// ecx and esi point at the two multiplicands and [esp] holds the destination
// pointer. ebp, edi and bl form the running accumulator (low word, next
// word, overflow).
// Clears the accumulator registers ebp, edi and ebx.
#define MulStartup \
AS2(xor ebp, ebp) \
AS2(xor edi, edi) \
AS2(xor ebx, ebx)
// Moves edx into ebp and ebx into edi, then clears ebx. MulStoreDigit leaves
// the previous edi in edx, so this presumably follows it to shift the
// accumulator down by one word -- the call sites are outside this view.
#define MulShiftCarry \
AS2(mov ebp, edx) \
AS2(mov edi, ebx) \
AS2(xor ebx, ebx)
// Adds only the low 32 bits of [ecx+4*j] * [esi+4*i] to ebp; no carry is
// propagated into edi or bl.
#define MulAccumulateBottom(i,j) \
AS2(mov eax, [ecx+4*j]) \
AS2(imul eax, dword ptr [esi+4*i]) \
AS2(add ebp, eax)
// Adds the full 64-bit product [ecx+4*j] * [esi+4*i] (edx:eax) into the
// accumulator: low half into ebp, high half plus carry into edi, and the
// remaining carry into bl (bh is zero while ebx has only been cleared and
// incremented through bl this way).
#define MulAccumulate(i,j) \
AS2(mov eax, [ecx+4*j]) \
AS1(mul dword ptr [esi+4*i]) \
AS2(add ebp, eax) \
AS2(adc edi, edx) \
AS2(adc bl, bh)
// Stores the finished low accumulator word ebp as word i of the destination.
// edi is first copied to edx because edi is reused to hold the destination
// pointer reloaded from [esp].
#define MulStoreDigit(i) \
AS2(mov edx, edi) \
AS2(mov edi, [esp]) \
AS2(mov [edi+4*i], ebp)
// Multiplies the top words (index digits-1) of both operands, adds the
// product to the accumulator (low half into ebp, edi plus carry into the
// high half in edx) and stores the two resulting words as the last two
// destination words, indices 2*digits-2 and 2*digits-1.
#define MulLastDiagonal(digits) \
AS2(mov eax, [ecx+4*(digits-1)]) \
AS1(mul dword ptr [esi+4*(digits-1)]) \
AS2(add ebp, eax) \
AS2(adc edx, edi) \
AS2(mov edi, [esp]) \
AS2(mov [edi+4*(2*digits-2)], ebp) \
AS2(mov [edi+4*(2*digits-1)], edx)
TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
const word* Y)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -