integer.cpp

来自「MySQL源码文件5.X系列, 可自已编译到服务器」· C++ 代码 · 共 2,603 行 · 第 1/5 页

CPP
2,603
字号
        return false;    bool result = true;    if (setjmp(s_env))        result = false;    else        __asm __volatile ("xorpd %xmm0, %xmm0");    signal(SIGILL, oldHandler);    return result;#endif}#endif // SSE2_INTRINSICS_AVAILABLEstatic bool IsP4(){    if (!IsPentium())        return false;    word32 cpuid[4];    CpuId(1, cpuid);    return ((cpuid[0] >> 8) & 0xf) == 0xf;}// ************** Pentium/P4 optimizations ***************class PentiumOptimized : public Portable{public:    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,                                   unsigned int N);    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,                                        unsigned int N);    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,                                         const word *B);    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,                                         const word *B);    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,                                               const word *B);};class P4Optimized{public:    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,                                   unsigned int N);    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,                                        unsigned int N);#ifdef SSE2_INTRINSICS_AVAILABLE    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,                                         const word *B);    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,                                         const word *B);    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,                                               const word *B);#endif};typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,                                        unsigned int N);typedef void (TAOCRYPT_CDECL * PMul)(word *C, const 
word *A, const word *B);static PAddSub s_pAdd, s_pSub;#ifdef SSE2_INTRINSICS_AVAILABLEstatic PMul s_pMul4, s_pMul8, s_pMul8B;#endifstatic void SetPentiumFunctionPointers(){    if (!IsPentium())    {           s_pAdd = &Portable::Add;        s_pSub = &Portable::Subtract;    }    else if (IsP4())    {        s_pAdd = &P4Optimized::Add;        s_pSub = &P4Optimized::Subtract;    }    else    {        s_pAdd = &PentiumOptimized::Add;        s_pSub = &PentiumOptimized::Subtract;    }#ifdef SSE2_INTRINSICS_AVAILABLE    if (!IsPentium())     {        s_pMul4 = &Portable::Multiply4;        s_pMul8 = &Portable::Multiply8;        s_pMul8B = &Portable::Multiply8Bottom;    }    else if (HasSSE2())    {        s_pMul4 = &P4Optimized::Multiply4;        s_pMul8 = &P4Optimized::Multiply8;        s_pMul8B = &P4Optimized::Multiply8Bottom;    }    else    {        s_pMul4 = &PentiumOptimized::Multiply4;        s_pMul8 = &PentiumOptimized::Multiply8;        s_pMul8B = &PentiumOptimized::Multiply8Bottom;    }#endif}static const char s_RunAtStartupSetPentiumFunctionPointers =    (SetPentiumFunctionPointers(), 0);class LowLevel : public PentiumOptimized{public:    inline static word Add(word *C, const word *A, const word *B,                           unsigned int N)        {return s_pAdd(C, A, B, N);}    inline static word Subtract(word *C, const word *A, const word *B,                                unsigned int N)        {return s_pSub(C, A, B, N);}    inline static void Square4(word *R, const word *A)        {Multiply4(R, A, A);}#ifdef SSE2_INTRINSICS_AVAILABLE    inline static void Multiply4(word *C, const word *A, const word *B)        {s_pMul4(C, A, B);}    inline static void Multiply8(word *C, const word *A, const word *B)        {s_pMul8(C, A, B);}    inline static void Multiply8Bottom(word *C, const word *A, const word *B)        {s_pMul8B(C, A, B);}#endif};// use some tricks to share assembly code between MSVC and GCC#ifdef _MSC_VER    #define TAOCRYPT_NAKED __declspec(naked)   
 #define AS1(x) __asm x    #define AS2(x, y) __asm x, y    #define AddPrologue \        __asm	push ebp \        __asm	push ebx \        __asm	push esi \        __asm	push edi \        __asm	mov		ecx, [esp+20] \        __asm	mov		edx, [esp+24] \        __asm	mov		ebx, [esp+28] \        __asm	mov		esi, [esp+32]    #define AddEpilogue \        __asm	pop edi \        __asm	pop esi \        __asm	pop ebx \        __asm	pop ebp \        __asm	ret    #define MulPrologue \        __asm	push ebp \        __asm	push ebx \        __asm	push esi \        __asm	push edi \        __asm	mov ecx, [esp+28] \        __asm	mov esi, [esp+24] \        __asm	push [esp+20]    #define MulEpilogue \        __asm	add esp, 4 \        __asm	pop edi \        __asm	pop esi \        __asm	pop ebx \        __asm	pop ebp \        __asm	ret#else    #define TAOCRYPT_NAKED    #define AS1(x) #x ";"    #define AS2(x, y) #x ", " #y ";"    #define AddPrologue \        __asm__ __volatile__ \        ( \            "push %%ebx;"	/* save this manually, in case of -fPIC */ \            "mov %2, %%ebx;" \            ".intel_syntax noprefix;" \            "push ebp;"    #define AddEpilogue \            "pop ebp;" \            ".att_syntax prefix;" \            "pop %%ebx;" \                    : \                    : "c" (C), "d" (A), "m" (B), "S" (N) \                    : "%edi", "memory", "cc" \        );    #define MulPrologue \        __asm__ __volatile__ \        ( \            "push %%ebx;"	/* save this manually, in case of -fPIC */ \            "push %%ebp;" \            "push %0;" \            ".intel_syntax noprefix;"    #define MulEpilogue \            "add esp, 4;" \            "pop ebp;" \            "pop ebx;" \            ".att_syntax prefix;" \            : \            : "rm" (Z), "S" (X), "c" (Y) \            : "%eax", "%edx", "%edi", "memory", "cc" \        );#endifTAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,                                          const word *B, 
unsigned int N){    AddPrologue    // now: ebx = B, ecx = C, edx = A, esi = N    AS2(    sub ecx, edx)           // hold the distance between C & A so we                                    // can add this to A to get C    AS2(    xor eax, eax)           // clear eax    AS2(    sub eax, esi)           // eax is a negative index from end of B    AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B    AS2(    sar eax, 1)             // unit of eax is now dwords; this also                                    // clears the carry flag    AS1(    jz  loopendAdd)         // if no dwords then nothing to do    AS1(loopstartAdd:)    AS2(    mov    esi,[edx])           // load lower word of A    AS2(    mov    ebp,[edx+4])         // load higher word of A    AS2(    mov    edi,[ebx+8*eax])     // load lower word of B    AS2(    lea    edx,[edx+8])         // advance A and C    AS2(    adc    esi,edi)             // add lower words    AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B    AS2(    adc    ebp,edi)             // add higher words    AS1(    inc    eax)                 // advance B    AS2(    mov    [edx+ecx-8],esi)     // store lower word result    AS2(    mov    [edx+ecx-4],ebp)     // store higher word result    AS1(    jnz    loopstartAdd)   // loop until eax overflows and becomes zero    AS1(loopendAdd:)    AS2(    adc eax, 0)     // store carry into eax (return result register)    AddEpilogue}TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,                                               const word *B, unsigned int N){    AddPrologue    // now: ebx = B, ecx = C, edx = A, esi = N    AS2(    sub ecx, edx)           // hold the distance between C & A so we                                    // can add this to A to get C    AS2(    xor eax, eax)           // clear eax    AS2(    sub eax, esi)           // eax is a negative index from end of B    AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B    AS2(    sar eax, 1)             // unit 
of eax is now dwords; this also                                    // clears the carry flag    AS1(    jz  loopendSub)         // if no dwords then nothing to do    AS1(loopstartSub:)    AS2(    mov    esi,[edx])           // load lower word of A    AS2(    mov    ebp,[edx+4])         // load higher word of A    AS2(    mov    edi,[ebx+8*eax])     // load lower word of B    AS2(    lea    edx,[edx+8])         // advance A and C    AS2(    sbb    esi,edi)             // subtract lower words    AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B    AS2(    sbb    ebp,edi)             // subtract higher words    AS1(    inc    eax)                 // advance B    AS2(    mov    [edx+ecx-8],esi)     // store lower word result    AS2(    mov    [edx+ecx-4],ebp)     // store higher word result    AS1(    jnz    loopstartSub)   // loop until eax overflows and becomes zero    AS1(loopendSub:)    AS2(    adc eax, 0)     // store carry into eax (return result register)    AddEpilogue}// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,                                     unsigned int N){    AddPrologue    // now: ebx = B, ecx = C, edx = A, esi = N    AS2(    xor     eax, eax)    AS1(    neg     esi)    AS1(    jz      loopendAddP4)       // if no dwords then nothing to do    AS2(    mov     edi, [edx])    AS2(    mov     ebp, [ebx])    AS1(    jmp     carry1AddP4)    AS1(loopstartAddP4:)    AS2(    mov     edi, [edx+8])    AS2(    add     ecx, 8)    AS2(    add     edx, 8)    AS2(    mov     ebp, [ebx])    AS2(    add     edi, eax)    AS1(    jc      carry1AddP4)    AS2(    xor     eax, eax)    AS1(carry1AddP4:)    AS2(    add     edi, ebp)    AS2(    mov     ebp, 1)    AS2(    mov     [ecx], edi)    AS2(    mov     edi, [edx+4])    AS2(    cmovc   eax, ebp)    AS2(    mov     ebp, [ebx+4])    AS2(    add     ebx, 8)    AS2(    add     edi, eax)    AS1(    jc      
carry2AddP4)
    AS2(    xor     eax, eax)

    AS1(carry2AddP4:)
    AS2(    add     edi, ebp)           // higher word sum
    AS2(    mov     ebp, 1)
    AS2(    cmovc   eax, ebp)           // eax = 1 if the add above carried
    AS2(    mov     [ecx+4], edi)       // store higher word result
    AS2(    add     esi, 2)             // two words done
    AS1(    jnz     loopstartAddP4)

    AS1(loopendAddP4:)

    AddEpilogue
}


// Subtracts the array B from A into C without sbb: the running borrow is
// kept in eax as 0 or 1, subtracted explicitly from each word of A, and
// re-derived with jc / cmovc. Two words are handled per loop iteration.
// NOTE(review): the negated count is stepped by 2 until it reaches zero, so
// N is presumably required to be even - confirm with callers.
TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
                                          const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2(    xor     eax, eax)           // borrow = 0
    AS1(    neg     esi)                // esi = -N, counted up towards zero
    AS1(    jz      loopendSubP4)       // if no dwords then nothing to do

    // first iteration: load the first words and enter the loop at the
    // point where the borrow-in step is skipped
    AS2(    mov     edi, [edx])
    AS2(    mov     ebp, [ebx])
    AS1(    jmp     carry1SubP4)

    AS1(loopstartSubP4:)
    AS2(    mov     edi, [edx+8])       // next lower word of A
    AS2(    add     edx, 8)             // advance A
    AS2(    add     ecx, 8)             // advance C
    AS2(    mov     ebp, [ebx])         // next lower word of B
    AS2(    sub     edi, eax)           // subtract the borrow-in
    AS1(    jc      carry1SubP4)        // borrow out already: leave eax alone
    AS2(    xor     eax, eax)

    AS1(carry1SubP4:)
    AS2(    sub     edi, ebp)           // lower word difference
    AS2(    mov     ebp, 1)
    AS2(    mov     [ecx], edi)         // store lower word result
    AS2(    mov     edi, [edx+4])       // higher word of A
    AS2(    cmovc   eax, ebp)           // eax = 1 if the sub above borrowed
    AS2(    mov     ebp, [ebx+4])       // higher word of B
    AS2(    add     ebx, 8)             // advance B
    AS2(    sub     edi, eax)           // subtract the borrow-in
    AS1(    jc      carry2SubP4)        // borrow out already: leave eax alone
    AS2(    xor     eax, eax)

    AS1(carry2SubP4:)
    AS2(    sub     edi, ebp)           // higher word difference
    AS2(    mov     ebp, 1)
    AS2(    cmovc   eax, ebp)           // eax = 1 if the sub above borrowed
    AS2(    mov     [ecx+4], edi)       // store higher word result
    AS2(    add     esi, 2)             // two words done
    AS1(    jnz     loopstartSubP4)

    AS1(loopendSubP4:)

    AddEpilogue
}


// multiply assembly code originally contributed by Leonard Janke

// The macros below keep a running column sum in ebp (low word), edi (next
// word) and bl (carries out of edi). esi and ecx address the two operands
// and [esp] holds the destination pointer.

// Clears the accumulator registers.
#define MulStartup \
    AS2(xor ebp, ebp) \
    AS2(xor edi, edi) \
    AS2(xor ebx, ebx) 

// Moves the accumulator down one word: the value MulStoreDigit parked in
// edx becomes the new low word, ebx becomes the new middle word, and ebx is
// cleared.
#define MulShiftCarry \
    AS2(mov ebp, edx) \
    AS2(mov edi, ebx) \
    AS2(xor ebx, ebx)

// Adds only the low 32 bits of [ecx+4*j] * [esi+4*i] into ebp.
#define MulAccumulateBottom(i,j) \
    AS2(mov eax, [ecx+4*j]) \
    AS2(imul eax, dword ptr [esi+4*i]) \
    AS2(add ebp, eax)

// Adds the full 64-bit product [ecx+4*j] * [esi+4*i] into ebp/edi and
// propagates the carry into bl.
#define MulAccumulate(i,j) \
    AS2(mov eax, [ecx+4*j]) \
    AS1(mul dword ptr [esi+4*i]) \
    AS2(add ebp, eax) \
    AS2(adc edi, edx) \
    AS2(adc bl, bh)

// Parks edi in edx, reloads the destination pointer from [esp] and stores
// ebp as result word i.
#define MulStoreDigit(i)  \
    AS2(mov edx, edi) \
    AS2(mov edi, [esp]) \
    AS2(mov [edi+4*i], ebp)

// Adds the last product and stores the two top result words.
#define MulLastDiagonal(digits) \
    AS2(mov eax, [ecx+4*(digits-1)]) \
    AS1(mul dword ptr [esi+4*(digits-1)]) \
    AS2(add ebp, eax) \
    AS2(adc edx, edi) \
    AS2(mov edi, [esp]) \
    AS2(mov [edi+4*(2*digits-2)], ebp) \
    AS2(mov [edi+4*(2*digits-1)], edx)


// Multiplies the 4-word operands X and Y into the 8-word result Z, one
// result word at a time: each group below accumulates the products whose
// indices sum to the word number, stores that word, then shifts the
// accumulator.
TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
                                                const word* Y)
{
    MulPrologue
    // now: [esp] = Z, esi = X, ecx = Y
    MulStartup

    // result word 0
    MulAccumulate(0,0)
    MulStoreDigit(0)
    MulShiftCarry

    // result word 1
    MulAccumulate(1,0)
    MulAccumulate(0,1)
    MulStoreDigit(1)
    MulShiftCarry

    // result word 2
    MulAccumulate(2,0)
    MulAccumulate(1,1)
    MulAccumulate(0,2)
    MulStoreDigit(2)
    MulShiftCarry

    // result word 3
    MulAccumulate(3,0)
    MulAccumulate(2,1)
    MulAccumulate(1,2)
    MulAccumulate(0,3)
    MulStoreDigit(3)
    MulShiftCarry

    // result word 4
    MulAccumulate(3,1)
    MulAccumulate(2,2)
    MulAccumulate(1,3)
    MulStoreDigit(4)
    MulShiftCarry

    // result word 5
    MulAccumulate(3,2)
    MulAccumulate(2,3)
    MulStoreDigit(5)
    MulShiftCarry

    // result words 6 and 7
    MulLastDiagonal(4)

    MulEpilogue
}


// Multiplies the 8-word operands X and Y into Z using the same scheme as
// Multiply4.
// NOTE(review): this function continues past the end of the visible chunk;
// only the part present in the source is reproduced here.
TAOCRYPT_NAKED void PentiumOptimized::Multiply8(word* Z, const word* X,
                                                const word* Y)
{
    MulPrologue
    // now: [esp] = Z, esi = X, ecx = Y
    MulStartup

    // result word 0
    MulAccumulate(0,0)
    MulStoreDigit(0)
    MulShiftCarry

    // result word 1
    MulAccumulate(1,0)
    MulAccumulate(0,1)
    MulStoreDigit(1)
    MulShiftCarry

    // result word 2
    MulAccumulate(2,0)
    MulAccumulate(1,1)
    MulAccumulate(0,2)
    MulStoreDigit(2)
    MulShiftCarry

    // result word 3
    MulAccumulate(3,0)
    MulAccumulate(2,1)
    MulAccumulate(1,2)
    MulAccumulate(0,3)
    MulStoreDigit(3)
    MulShiftCarry

    // result word 4
    MulAccumulate(4,0)
    MulAccumulate(3,1)
    MulAccumulate(2,2)
    MulAccumulate(1,3)
    MulAccumulate(0,4)
    MulStoreDigit(4)
    MulShiftCarry

    // result word 5
    MulAccumulate(5,0)
    MulAccumulate(4,1)
    MulAccumulate(3,2)
    MulAccumulate(2,3)
    MulAccumulate(1,4)
    MulAccumulate(0,5)
    MulStoreDigit(5)
    MulShiftCarry

    // result word 6
    MulAccumulate(6,0)
    MulAccumulate(5,1)
    MulAccumulate(4,2)
    MulAccumulate(3,3)
    MulAccumulate(2,4)
    MulAccumulate(1,5)
    MulAccumulate(0,6)
    MulStoreDigit(6)
    MulShiftCarry

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?