📄 integer.cpp
字号:
#endif
// NOTE(review): the #endif above closes a conditional opened before this
// excerpt; the #endif that follows HasSSE2() likewise pairs with a directive
// that is not visible here.

// Reports whether the XMM-based multiply routines may be used.
// Returns false when SSE2 use has been switched off (s_sse2Enabled), when
// bit 26 of cpuid[3] from CpuId(1, ...) is clear (presumably the SSE2
// feature bit in EDX -- confirm against CpuId's output layout), or when
// actually executing an XMM instruction faults.
static bool HasSSE2()
{
    if (!s_sse2Enabled)
        return false;

    word32 cpuid[4];
    CpuId(1, cpuid);
    if ((cpuid[3] & (1 << 26)) == 0)
        return false;

#ifdef _MSC_VER
    // Probe under structured exception handling: any exception raised by the
    // instruction is caught and reported as "not available".
    __try
    {
        __asm xorpd xmm0, xmm0 // executing SSE2 instruction
    }
    __except (1)
    {
        return false;
    }
    return true;
#else
    typedef void (*SigHandler)(int);

    // Temporarily install SigIllHandler for SIGILL; it presumably longjmp()s
    // back to s_env (handler is defined outside this excerpt -- confirm).
    SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    bool result = true;
    if (setjmp(s_env))
        result = false; // reached via a non-zero return from setjmp
    else
        __asm __volatile ("xorps %xmm0, %xmm0"); // NOTE(review): MSVC branch probes with xorpd, this one with xorps

    // Always restore the previous SIGILL disposition before returning.
    signal(SIGILL, oldHandler);
    return result;
#endif
}
#endif

// Returns true when the CPUID vendor string is "GenuineIntel" and bits 8..11
// of cpuid[0] from CpuId(1, ...) equal 0xf (presumably the family field of
// EAX, i.e. a Pentium 4 class processor -- confirm).
static bool IsP4()
{
    word32 cpuid[4];

    CpuId(0, cpuid);
    // Swap the last two words so that cpuid[1..3] reads as one contiguous
    // 12-byte vendor string for the memcmp below.
    mySTL::swap(cpuid[2], cpuid[3]);
    if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
        return false;

    CpuId(1, cpuid);
    return ((cpuid[0] >> 8) & 0xf) == 0xf;
}

// ************** Pentium/P4 optimizations ***************

// Hand-written assembly primitives using adc/sbb and mul; everything not
// declared here is inherited from Portable.
class PentiumOptimized : public Portable
{
public:
    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, const word *B);
    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, const word *B);
    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
};

// Variants selected at run time by SetPentiumFunctionPointers(). The multiply
// routines are only declared when SSE2_INTRINSICS_AVAILABLE is defined.
class P4Optimized
{
public:
    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
#ifdef SSE2_INTRINSICS_AVAILABLE
    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A, const word *B);
    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A, const word *B);
    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
#endif
};

// Function-pointer types shared by both implementations above.
typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);

// Currently selected implementations; assigned by SetPentiumFunctionPointers().
static PAddSub s_pAdd, s_pSub;
#ifdef SSE2_INTRINSICS_AVAILABLE
static PMul s_pMul4, s_pMul8, s_pMul8B;
#endif

// Chooses the add/subtract routines according to IsP4() and, when
// SSE2_INTRINSICS_AVAILABLE is defined, the multiply routines according to
// HasSSE2().
static void SetPentiumFunctionPointers()
{
    if (IsP4())
    {
        s_pAdd = &P4Optimized::Add;
        s_pSub = &P4Optimized::Subtract;
    }
    else
    {
        s_pAdd = &PentiumOptimized::Add;
        s_pSub = &PentiumOptimized::Subtract;
    }

#ifdef SSE2_INTRINSICS_AVAILABLE
    if (HasSSE2())
    {
        s_pMul4 = &P4Optimized::Multiply4;
        s_pMul8 = &P4Optimized::Multiply8;
        s_pMul8B = &P4Optimized::Multiply8Bottom;
    }
    else
    {
        s_pMul4 = &PentiumOptimized::Multiply4;
        s_pMul8 = &PentiumOptimized::Multiply8;
        s_pMul8B = &PentiumOptimized::Multiply8Bottom;
    }
#endif
}

// The comma expression runs SetPentiumFunctionPointers() as a side effect of
// initializing this otherwise unused variable.
// NOTE(review): the pointers are therefore unset until this translation
// unit's initializers have run -- confirm no earlier initializer uses them.
static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);

// Clears s_sse2Enabled (which makes HasSSE2() return false) and re-selects
// the function pointers accordingly.
void DisableSSE2()
{
    s_sse2Enabled = false;
    SetPentiumFunctionPointers();
}

// Front end used by the rest of the file: forwards each primitive to the
// implementation chosen at run time. Without SSE2_INTRINSICS_AVAILABLE the
// multiply routines are the ones inherited from PentiumOptimized.
class LowLevel : public PentiumOptimized
{
public:
    inline static word Add(word *C, const word *A, const word *B, unsigned int N)
        {return s_pAdd(C, A, B, N);}
    inline static word Subtract(word *C, const word *A, const word *B, unsigned int N)
        {return s_pSub(C, A, B, N);}
    // Squaring is expressed as a multiplication of A by itself.
    inline static void Square4(word *R, const word *A)
        {Multiply4(R, A, A);}
#ifdef SSE2_INTRINSICS_AVAILABLE
    inline static void Multiply4(word *C, const word *A, const word *B)
        {s_pMul4(C, A, B);}
    inline static void Multiply8(word *C, const word *A, const word *B)
        {s_pMul8(C, A, B);}
    inline static void Multiply8Bottom(word *C, const word *A, const word *B)
        {s_pMul8B(C, A, B);}
#endif
};

// use some tricks to share assembly code between MSVC and GCC
//
// AS1/AS2 wrap one- and two-operand instructions: under MSVC they expand to
// __asm statements, otherwise they stringify their arguments into fragments
// of a single GCC extended-asm string written in Intel syntax.
// The *Prologue macros open the assembly sequence and the matching *Epilogue
// macros close it; each pair must be used together within one function.
#ifdef _MSC_VER
    #define TAOCRYPT_NAKED __declspec(naked)
    #define AS1(x) __asm x
    #define AS2(x, y) __asm x, y
    // Saves ebp/ebx/esi/edi, then loads the four arguments from the stack
    // (first argument at [esp+20] after the four pushes):
    // ecx = C, edx = A, ebx = B, esi = N.
    #define AddPrologue \
        __asm push ebp \
        __asm push ebx \
        __asm push esi \
        __asm push edi \
        __asm mov ecx, [esp+20] \
        __asm mov edx, [esp+24] \
        __asm mov ebx, [esp+28] \
        __asm mov esi, [esp+32]
    // Restores the saved registers and returns; the result is left in eax.
    #define AddEpilogue \
        __asm pop edi \
        __asm pop esi \
        __asm pop ebx \
        __asm pop ebp \
        __asm ret
    // Saves ebp/ebx/esi/edi, loads ecx = third argument and esi = second
    // argument, and pushes the first argument so that it sits at [esp].
    #define MulPrologue \
        __asm push ebp \
        __asm push ebx \
        __asm push esi \
        __asm push edi \
        __asm mov ecx, [esp+28] \
        __asm mov esi, [esp+24] \
        __asm push [esp+20]
    // Drops the pushed first argument, restores the registers and returns.
    #define MulEpilogue \
        __asm add esp, 4 \
        __asm pop edi \
        __asm pop esi \
        __asm pop ebx \
        __asm pop ebp \
        __asm ret
#else
    #define TAOCRYPT_NAKED
    #define AS1(x) #x ";"
    #define AS2(x, y) #x ", " #y ";"
    // Opens an asm statement: ebx is saved by hand and loaded from the memory
    // operand %2 (B); C, A and N arrive in ecx, edx and esi through the input
    // constraints listed in AddEpilogue.
    #define AddPrologue \
        __asm__ __volatile__ \
        ( \
            "push %%ebx;" /* save this manually, in case of -fPIC */ \
            "mov %2, %%ebx;" \
            ".intel_syntax noprefix;" \
            "push ebp;"
    // Closes the asm statement opened by AddPrologue and supplies its
    // operand and clobber lists.
    #define AddEpilogue \
            "pop ebp;" \
            ".att_syntax prefix;" \
            "pop %%ebx;" \
                : \
                : "c" (C), "d" (A), "m" (B), "S" (N) \
                : "%edi", "memory", "cc" \
        );
    // Opens an asm statement: saves ebx and ebp by hand and pushes operand
    // %0 (Z) so that it sits at [esp]; X and Y arrive in esi and ecx.
    #define MulPrologue \
        __asm__ __volatile__ \
        ( \
            "push %%ebx;" /* save this manually, in case of -fPIC */ \
            "push %%ebp;" \
            "push %0;" \
            ".intel_syntax noprefix;"
    // Closes the asm statement opened by MulPrologue and supplies its
    // operand and clobber lists.
    #define MulEpilogue \
            "add esp, 4;" \
            "pop ebp;" \
            "pop ebx;" \
            ".att_syntax prefix;" \
            : \
            : "rm" (Z), "S" (X), "c" (Y) \
            : "%eax", "%edx", "%edi", "memory", "cc" \
        );
#endif

// Adds the N-word arrays A and B into C using an adc chain, two words per
// loop iteration, and leaves the final carry in eax as the return value.
// NOTE(review): N is halved with sar, so it is presumably required to be
// even -- confirm against callers.
TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2( sub ecx, edx) // hold the distance between C & A so we
                       // can add this to A to get C
    AS2( xor eax, eax) // clear eax

    AS2( sub eax, esi) // eax is a negative index from end of B
    AS2( lea ebx, [ebx+4*esi]) // ebx is end of B

    AS2( sar eax, 1) // unit of eax is now dwords; this also
                     // clears the carry flag
    AS1( jz loopendAdd) // if no dwords then nothing to do

    // The mov/lea/inc instructions between the adc's leave the carry flag
    // untouched, so the carry chain survives from one iteration to the next.
    AS1(loopstartAdd:)
    AS2( mov esi,[edx]) // load lower word of A
    AS2( mov ebp,[edx+4]) // load higher word of A

    AS2( mov edi,[ebx+8*eax]) // load lower word of B
    AS2( lea edx,[edx+8]) // advance A and C

    AS2( adc esi,edi) // add lower words
    AS2( mov edi,[ebx+8*eax+4]) // load higher word of B

    AS2( adc ebp,edi) // add higher words
    AS1( inc eax) // advance B

    AS2( mov [edx+ecx-8],esi) // store lower word result
    AS2( mov [edx+ecx-4],ebp) // store higher word result

    AS1( jnz loopstartAdd) // loop until the negative index in eax counts up to zero

    AS1(loopendAdd:)
    AS2( adc eax, 0) // store carry into eax (return result register)

    AddEpilogue
}

// Subtracts the N-word array B from A into C using an sbb chain, two words
// per loop iteration, and leaves the final borrow in eax as the return value.
// NOTE(review): N is halved with sar, so it is presumably required to be
// even -- confirm against callers.
TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2( sub ecx, edx) // hold the distance between C & A so we
                       // can add this to A to get C
    AS2( xor eax, eax) // clear eax

    AS2( sub eax, esi) // eax is a negative index from end of B
    AS2( lea ebx, [ebx+4*esi]) // ebx is end of B

    AS2( sar eax, 1) // unit of eax is now dwords; this also
                     // clears the carry flag
    AS1( jz loopendSub) // if no dwords then nothing to do

    // The mov/lea/inc instructions between the sbb's leave the carry flag
    // untouched, so the borrow chain survives from one iteration to the next.
    AS1(loopstartSub:)
    AS2( mov esi,[edx]) // load lower word of A
    AS2( mov ebp,[edx+4]) // load higher word of A

    AS2( mov edi,[ebx+8*eax]) // load lower word of B
    AS2( lea edx,[edx+8]) // advance A and C

    AS2( sbb esi,edi) // subtract lower words
    AS2( mov edi,[ebx+8*eax+4]) // load higher word of B

    AS2( sbb ebp,edi) // subtract higher words
    AS1( inc eax) // advance B

    AS2( mov [edx+ecx-8],esi) // store lower word result
    AS2( mov [edx+ecx-4],ebp) // store higher word result

    AS1( jnz loopstartSub) // loop until the negative index in eax counts up to zero

    AS1(loopendSub:)
    AS2( adc eax, 0) // store the final borrow (carry flag) into eax (return result register)

    AddEpilogue
}

// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
//
// Adds A and B into C, two words per loop iteration. The carry is kept as a
// 0/1 value in eax and propagated with add/jc/cmovc instead of adc; eax is
// also the return value.
// NOTE(review): the counter steps by 2, so N is presumably required to be
// even -- confirm against callers.
TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2( xor eax, eax)
    AS1( neg esi) // esi counts up from -N to zero
    AS1( jz loopendAddP4) // if no dwords then nothing to do

    // First iteration: there is no incoming carry, so load the low words and
    // jump straight to the addition.
    AS2( mov edi, [edx])
    AS2( mov ebp, [ebx])
    AS1( jmp carry1AddP4)

    AS1(loopstartAddP4:)
    AS2( mov edi, [edx+8])
    AS2( add ecx, 8)
    AS2( add edx, 8)
    AS2( mov ebp, [ebx])
    AS2( add edi, eax) // fold in the carry from the previous word
    AS1( jc carry1AddP4) // that addition itself carried: keep eax as it is
    AS2( xor eax, eax)

    AS1(carry1AddP4:)
    AS2( add edi, ebp)
    AS2( mov ebp, 1)
    AS2( mov [ecx], edi)
    AS2( mov edi, [edx+4])
    AS2( cmovc eax, ebp) // eax = 1 if the add above carried
    AS2( mov ebp, [ebx+4])
    AS2( add ebx, 8)
    AS2( add edi, eax) // fold the carry into the higher word
    AS1( jc carry2AddP4)
    AS2( xor eax, eax)

    AS1(carry2AddP4:)
    AS2( add edi, ebp)
    AS2( mov ebp, 1)
    AS2( cmovc eax, ebp)
    AS2( mov [ecx+4], edi)
    AS2( add esi, 2)
    AS1( jnz loopstartAddP4)

    AS1(loopendAddP4:)

    AddEpilogue
}

// Subtracts B from A into C, two words per loop iteration. The borrow is
// kept as a 0/1 value in eax and propagated with sub/jc/cmovc instead of
// sbb; eax is also the return value.
// NOTE(review): the counter steps by 2, so N is presumably required to be
// even -- confirm against callers.
TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2( xor eax, eax)
    AS1( neg esi) // esi counts up from -N to zero
    AS1( jz loopendSubP4) // if no dwords then nothing to do

    // First iteration: there is no incoming borrow, so load the low words
    // and jump straight to the subtraction.
    AS2( mov edi, [edx])
    AS2( mov ebp, [ebx])
    AS1( jmp carry1SubP4)

    AS1(loopstartSubP4:)
    AS2( mov edi, [edx+8])
    AS2( add edx, 8)
    AS2( add ecx, 8)
    AS2( mov ebp, [ebx])
    AS2( sub edi, eax) // take off the borrow from the previous word
    AS1( jc carry1SubP4) // that subtraction itself borrowed: keep eax as it is
    AS2( xor eax, eax)

    AS1(carry1SubP4:)
    AS2( sub edi, ebp)
    AS2( mov ebp, 1)
    AS2( mov [ecx], edi)
    AS2( mov edi, [edx+4])
    AS2( cmovc eax, ebp) // eax = 1 if the sub above borrowed
    AS2( mov ebp, [ebx+4])
    AS2( add ebx, 8)
    AS2( sub edi, eax) // take the borrow off the higher word
    AS1( jc carry2SubP4)
    AS2( xor eax, eax)

    AS1(carry2SubP4:)
    AS2( sub edi, ebp)
    AS2( mov ebp, 1)
    AS2( cmovc eax, ebp)
    AS2( mov [ecx+4], edi)
    AS2( add esi, 2)
    AS1( jnz loopstartSubP4)

    AS1(loopendSubP4:)

    AddEpilogue
}

// multiply assembly code originally contributed by Leonard Janke
//
// The macros below build a column-by-column multiplication. ebp, edi and bl
// form a three-part accumulator (low word, high word, overflow count);
// ecx and esi point at the two operands and [esp] holds the result pointer.

// Zeroes the accumulator registers.
#define MulStartup \
    AS2(xor ebp, ebp) \
    AS2(xor edi, edi) \
    AS2(xor ebx, ebx)

// Moves the accumulator down by one word for the next column; edx is
// expected to hold the old high word, as left there by MulStoreDigit.
#define MulShiftCarry \
    AS2(mov ebp, edx) \
    AS2(mov edi, ebx) \
    AS2(xor ebx, ebx)

// Adds only the low 32 bits of [esi+4*i] * [ecx+4*j] into ebp.
#define MulAccumulateBottom(i,j) \
    AS2(mov eax, [ecx+4*j]) \
    AS2(imul eax, dword ptr [esi+4*i]) \
    AS2(add ebp, eax)

// Adds the full 64-bit product [esi+4*i] * [ecx+4*j] into the accumulator;
// "adc bl, bh" adds just the carry into bl (ebx is zeroed by MulStartup and
// MulShiftCarry, and nothing in these macros writes bh).
#define MulAccumulate(i,j) \
    AS2(mov eax, [ecx+4*j]) \
    AS1(mul dword ptr [esi+4*i]) \
    AS2(add ebp, eax) \
    AS2(adc edi, edx) \
    AS2(adc bl, bh)

// Stores the finished low word ebp as result word i, after parking the high
// word in edx so that edi can be reused for the result pointer.
#define MulStoreDigit(i) \
    AS2(mov edx, edi) \
    AS2(mov edi, [esp]) \
    AS2(mov [edi+4*i], ebp)

// Handles the final product term and stores the top two result words.
#define MulLastDiagonal(digits) \
    AS2(mov eax, [ecx+4*(digits-1)]) \
    AS1(mul dword ptr [esi+4*(digits-1)]) \
    AS2(add ebp, eax) \
    AS2(adc edx, edi) \
    AS2(mov edi, [esp]) \
    AS2(mov [edi+4*(2*digits-2)], ebp) \
    AS2(mov [edi+4*(2*digits-1)], edx)

// Multiplies the 4-word operands X and Y into the 8-word result Z, one
// output word (column) at a time. For each column every MulAccumulate(i,j)
// has i + j equal to the index of the word being stored.
TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
{
    MulPrologue
    // now: [esp] = Z, esi = X, ecx = Y
    MulStartup
    MulAccumulate(0,0)
    MulStoreDigit(0)
    MulShiftCarry

    MulAccumulate(1,0)
    MulAccumulate(0,1)
    MulStoreDigit(1)
    MulShiftCarry

    MulAccumulate(2,0)
    MulAccumulate(1,1)
    MulAccumulate(0,2)
    MulStoreDigit(2)
    MulShiftCarry

    MulAccumulate(3,0)
    MulAccumulate(2,1)
    MulAccumulate(1,2)
    MulAccumulate(0,3)
    MulStoreDigit(3)
    MulShiftCarry

    MulAccumulate(3,1)
    MulAccumulate(2,2)
    MulAccumulate(1,3)
    MulStoreDigit(4)
    MulShiftCarry

    MulAccumulate(3,2)
    MulAccumulate(2,3)
    MulStoreDigit(5)
    MulShiftCarry

    MulLastDiagonal(4)
    MulEpilogue
}

// Column-by-column multiplication built from the same macros as Multiply4.
// NOTE(review): only the first five output columns are visible in this
// excerpt; the remainder of the routine lies beyond it.
TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
{
    MulPrologue
    // now: [esp] = Z, esi = X, ecx = Y
    MulStartup
    MulAccumulate(0,0)
    MulStoreDigit(0)
    MulShiftCarry

    MulAccumulate(1,0)
    MulAccumulate(0,1)
    MulStoreDigit(1)
    MulShiftCarry

    MulAccumulate(2,0)
    MulAccumulate(1,1)
    MulAccumulate(0,2)
    MulStoreDigit(2)
    MulShiftCarry

    MulAccumulate(3,0)
    MulAccumulate(2,1)
    MulAccumulate(1,2)
    MulAccumulate(0,3)
    MulStoreDigit(3)
    MulShiftCarry

    MulAccumulate(4,0)
    MulAccumulate(3,1)
    MulAccumulate(2,2)
    MulAccumulate(1,3)
    MulAccumulate(0,4)
    MulStoreDigit(4)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -