⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 integer.cpp

📁 一个不错的关于手机模块程序This page contains everything that has changed in the history of DC++. Read this to fin
💻 CPP
📖 第 1 页 / 共 5 页
字号:
    R[6] = c;
    R[7] = d + A[0] * B[7] + A[1] * B[6] + A[2] * B[5] + A[3] * B[4] +
               A[4] * B[3] + A[5] * B[2] + A[6] * B[1] + A[7] * B[0];
}


// Retire the accumulate helper macros (presumably defined earlier in this
// file for the portable multiply/square routines) so the names cannot leak
// into the code that follows.
#undef MulAcc
#undef SaveMulAcc
#undef SquAcc
#undef SaveSquAcc

// optimized

#ifdef TAOCRYPT_X86ASM_AVAILABLE

// ************** x86 feature detection ***************

// Run-time switch consulted first by HasSSE2(); DisableSSE2() clears it so
// that the SSE2 code paths are no longer selected.
static bool s_sse2Enabled = true;

// Executes the x86 CPUID instruction with eax = input and stores the
// resulting eax, ebx, ecx and edx values in output[0], output[1], output[2]
// and output[3] respectively.  `output` must have room for four word32s.
static void CpuId(word32 input, word32 *output)
{
#ifdef __GNUC__
    __asm__
    (
        // save ebx in case -fPIC is being used
        // (the ebx result is copied to edi and handed back through the "=D"
        // operand, so ebx itself can be restored before the statement ends)
        "push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
        : "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d"(output[3])
        : "a" (input)
    );
#else
    // MSVC inline assembly: run cpuid, then write the four result registers
    // through edi, which is loaded with the output pointer after cpuid.
    __asm
    {
        mov eax, input
        cpuid
        mov edi, output
        mov [edi], eax
        mov [edi+4], ebx
        mov [edi+8], ecx
        mov [edi+12], edx
    }
#endif
}

#ifdef SSE2_INTRINSICS_AVAILABLE
#ifndef _MSC_VER
// Jump context armed by the setjmp() call in HasSSE2(); used to get back
// there if the trial instruction raises SIGILL.
static jmp_buf s_env;
// SIGILL handler installed temporarily by HasSSE2().  It does not return:
// it jumps back to the setjmp(s_env) call, which then yields 1.
static void SigIllHandler(int)
{
    longjmp(s_env, 1);
}
#endif

// Reports whether the SSE2 routines may be used.  All of the following must
// hold: the s_sse2Enabled switch is on, CPUID leaf 1 reports bit 26 of edx,
// and a trial XMM instruction executes without faulting.
static bool HasSSE2()
{
    if (!s_sse2Enabled)
        return false;

    word32 cpuid[4];
    CpuId(1, cpuid);
    // cpuid[3] holds edx; bit 26 is the SSE2 feature flag
    if ((cpuid[3] & (1 << 26)) == 0)
        return false;

#ifdef _MSC_VER
    // Probe under structured exception handling: the filter value 1 makes
    // the handler take any exception raised by the instruction, which is
    // then reported as "no SSE2".
    __try
    {
        __asm xorpd xmm0, xmm0        // executing SSE2 instruction
    }
    __except (1)
    {
        return false;
    }
    return true;
#else
    typedef void (*SigHandler)(int);

    // Install SigIllHandler for the duration of the probe, remembering the
    // previous handler; if that fails, report "no SSE2".
    SigHandler oldHandler = signal(SIGILL, SigIllHandler);
    if (oldHandler == SIG_ERR)
        return false;

    // setjmp() yields 0 when called directly and non-zero when
    // SigIllHandler jumps back here after the instruction faulted.
    // NOTE(review): xorps is an SSE instruction, whereas the MSVC branch
    // executes xorpd (SSE2) -- confirm the difference is intentional.
    // NOTE(review): leaving a signal handler via longjmp may leave SIGILL
    // blocked on platforms where setjmp does not save the signal mask --
    // confirm, or consider sigsetjmp/siglongjmp.
    bool result = true;
    if (setjmp(s_env))
        result = false;
    else
        __asm __volatile ("xorps %xmm0, %xmm0");

    // put the previous SIGILL handler back
    signal(SIGILL, oldHandler);
    return result;
#endif
}
#endif

static bool IsP4()
{
    word32 cpuid[4];

    CpuId(0, cpuid);
    mySTL::swap(cpuid[2], cpuid[3]);
    if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
        return false;

    CpuId(1, cpuid);
    return ((cpuid[0] >> 8) & 0xf) == 0xf;
}

// ************** Pentium/P4 optimizations ***************

// Assembly implementations of the low-level word-array primitives, layered
// over Portable.  Add and Subtract are defined further down in this file
// using adc/sbb chains; both process two words per loop iteration, so N is
// expected to be even.
class PentiumOptimized : public Portable
{
public:
    // C[0..N) = A[0..N) + B[0..N); returns the carry out of the top word.
    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
                                   unsigned int N);
    // C[0..N) = A[0..N) - B[0..N); returns the borrow out of the top word.
    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
                                        unsigned int N);
    // Fixed-size multiplies; judging by the names, 4x4 and 8x8 word products
    // and the low half of an 8x8 product -- TODO confirm against the
    // definitions.
    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
                                         const word *B);
    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
                                         const word *B);
    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
                                               const word *B);
};

// Variants of the same primitives used when the CPU is detected as a
// Pentium 4 (Add/Subtract) or when SSE2 is usable (the multiplies); see
// SetPentiumFunctionPointers().  Unlike PentiumOptimized this class does not
// derive from Portable and is only reached through the s_p* pointers.
class P4Optimized
{
public:
    // Same contract as PentiumOptimized::Add / Subtract; the definitions
    // below avoid adc/sbb and also expect an even N.
    static word TAOCRYPT_CDECL Add(word *C, const word *A, const word *B,
                                   unsigned int N);
    static word TAOCRYPT_CDECL Subtract(word *C, const word *A, const word *B,
                                        unsigned int N);
#ifdef SSE2_INTRINSICS_AVAILABLE
    // Multiplies that exist only in SSE2-capable builds; presumably
    // implemented with SSE2 intrinsics -- TODO confirm against the
    // definitions.
    static void TAOCRYPT_CDECL Multiply4(word *C, const word *A,
                                         const word *B);
    static void TAOCRYPT_CDECL Multiply8(word *C, const word *A,
                                         const word *B);
    static void TAOCRYPT_CDECL Multiply8Bottom(word *C, const word *A,
                                               const word *B);
#endif
};

// Function-pointer type matching the Add/Subtract signature:
// (result, left operand, right operand, word count) -> carry/borrow.
typedef word (TAOCRYPT_CDECL * PAddSub)(word *C, const word *A, const word *B,
                                        unsigned int N);
// Function-pointer type matching the fixed-size multiply signature.
typedef void (TAOCRYPT_CDECL * PMul)(word *C, const word *A, const word *B);

// Run-time dispatch slots, filled in by SetPentiumFunctionPointers() and
// called through the LowLevel wrappers.
static PAddSub s_pAdd, s_pSub;
#ifdef SSE2_INTRINSICS_AVAILABLE
// The multiplies are dispatched only in SSE2-capable builds.
static PMul s_pMul4, s_pMul8, s_pMul8B;
#endif

// Fills the dispatch pointers according to the CPU this process runs on:
//  - Add/Subtract: the P4Optimized versions when IsP4() holds, otherwise the
//    PentiumOptimized versions;
//  - Multiply4/8/8Bottom (SSE2-capable builds only): the P4Optimized versions
//    when HasSSE2() holds, otherwise the PentiumOptimized versions.
static void SetPentiumFunctionPointers()
{
    const bool onP4 = IsP4();
    s_pAdd = onP4 ? &P4Optimized::Add : &PentiumOptimized::Add;
    s_pSub = onP4 ? &P4Optimized::Subtract : &PentiumOptimized::Subtract;

#ifdef SSE2_INTRINSICS_AVAILABLE
    const bool sse2 = HasSSE2();
    s_pMul4 = sse2 ? &P4Optimized::Multiply4
                   : &PentiumOptimized::Multiply4;
    s_pMul8 = sse2 ? &P4Optimized::Multiply8
                   : &PentiumOptimized::Multiply8;
    s_pMul8B = sse2 ? &P4Optimized::Multiply8Bottom
                    : &PentiumOptimized::Multiply8Bottom;
#endif
}

// Dummy constant whose initializer calls SetPentiumFunctionPointers() as a
// side effect (the comma expression then yields 0), so the dispatch pointers
// are filled in during this file's static initialization.
// NOTE(review): static initializers in other translation units may run
// earlier and would see unset pointers -- confirm none of them use Integer.
static const char s_RunAtStartupSetPentiumFunctionPointers =
    (SetPentiumFunctionPointers(), 0);

// Turns SSE2 use off at run time.  The flag is cleared first so that the
// HasSSE2() check inside SetPentiumFunctionPointers() fails and the
// dispatch pointers are re-selected without the SSE2 routines.
void DisableSSE2()
{
    s_sse2Enabled = false;
    SetPentiumFunctionPointers();
}

// Facade used for the low-level primitives in x86-asm builds.  Add and
// Subtract always go through the run-time dispatch pointers; the multiplies
// do so only in SSE2-capable builds and are otherwise inherited unchanged
// from PentiumOptimized.
class LowLevel : public PentiumOptimized
{
public:
    // C = A + B over N words via the selected implementation; returns the
    // carry.
    static inline word Add(word *C, const word *A, const word *B,
                           unsigned int N)
    {
        return (*s_pAdd)(C, A, B, N);
    }

    // C = A - B over N words via the selected implementation; returns the
    // borrow.
    static inline word Subtract(word *C, const word *A, const word *B,
                                unsigned int N)
    {
        return (*s_pSub)(C, A, B, N);
    }

    // Squaring is done by multiplying A with itself using whichever
    // Multiply4 is visible in this class.
    static inline void Square4(word *R, const word *A)
    {
        Multiply4(R, A, A);
    }

#ifdef SSE2_INTRINSICS_AVAILABLE
    // Dispatching overrides of the inherited fixed-size multiplies.
    static inline void Multiply4(word *C, const word *A, const word *B)
    {
        (*s_pMul4)(C, A, B);
    }

    static inline void Multiply8(word *C, const word *A, const word *B)
    {
        (*s_pMul8)(C, A, B);
    }

    static inline void Multiply8Bottom(word *C, const word *A, const word *B)
    {
        (*s_pMul8B)(C, A, B);
    }
#endif
};

// use some tricks to share assembly code between MSVC and GCC
//
// The routines below are written once as sequences of AS1()/AS2() lines
// bracketed by a Prologue/Epilogue macro pair.  Under MSVC each line becomes
// an __asm statement inside a naked function; under GCC each line becomes a
// string literal and the pair wraps them all into one extended-asm statement
// assembled in Intel syntax.
#ifdef _MSC_VER
    // naked: the compiler emits no prologue/epilogue, the macros below do
    #define TAOCRYPT_NAKED __declspec(naked)
    // one-operand and two-operand instruction (or label) lines
    #define AS1(x) __asm x
    #define AS2(x, y) __asm x, y
    // Save ebp/ebx/esi/edi, then load the four stack arguments:
    // ecx = C, edx = A, ebx = B, esi = N (16 bytes of saved registers plus
    // the return address precede the first argument).
    #define AddPrologue \
        __asm	push ebp \
        __asm	push ebx \
        __asm	push esi \
        __asm	push edi \
        __asm	mov		ecx, [esp+20] \
        __asm	mov		edx, [esp+24] \
        __asm	mov		ebx, [esp+28] \
        __asm	mov		esi, [esp+32]
    // Restore the saved registers and return; the result stays in eax.
    #define AddEpilogue \
        __asm	pop edi \
        __asm	pop esi \
        __asm	pop ebx \
        __asm	pop ebp \
        __asm	ret
    // Save registers, load ecx = third argument and esi = second argument,
    // and push the first argument so that it can be read back from [esp].
    #define MulPrologue \
        __asm	push ebp \
        __asm	push ebx \
        __asm	push esi \
        __asm	push edi \
        __asm	mov ecx, [esp+28] \
        __asm	mov esi, [esp+24] \
        __asm	push [esp+20]
    // Drop the pushed first argument, restore registers and return.
    #define MulEpilogue \
        __asm	add esp, 4 \
        __asm	pop edi \
        __asm	pop esi \
        __asm	pop ebx \
        __asm	pop ebp \
        __asm	ret
#else
    // GCC: ordinary (non-naked) functions; the asm statement does the work
    #define TAOCRYPT_NAKED
    // stringify each line and terminate it with ';' so that adjacent
    // literals concatenate into one asm template
    #define AS1(x) #x ";"
    #define AS2(x, y) #x ", " #y ";"
    // Opens the asm statement: saves ebx, loads B (operand %2) into ebx,
    // switches to Intel syntax and saves ebp.
    // NOTE(review): %2 is a memory operand read after a push; if the compiler
    // addresses B relative to esp the offset would be off by one slot --
    // confirm the build keeps a frame pointer for these functions.
    #define AddPrologue \
        __asm__ __volatile__ \
        ( \
            "push %%ebx;"	/* save this manually, in case of -fPIC */ \
            "mov %2, %%ebx;" \
            ".intel_syntax noprefix;" \
            "push ebp;"
    // Closes the asm statement: restores ebp and ebx and binds the operands
    // (C in ecx, A in edx, B in memory, N in esi).
    // NOTE(review): eax is neither an output nor a clobber and the enclosing
    // functions have no return statement; the result is simply left in eax.
    #define AddEpilogue \
            "pop ebp;" \
            ".att_syntax prefix;" \
            "pop %%ebx;" \
                    : \
                    : "c" (C), "d" (A), "m" (B), "S" (N) \
                    : "%edi", "memory", "cc" \
        );
    // Opens the asm statement: saves ebx and ebp and pushes Z (operand %0)
    // so that it can be read back from [esp].
    // NOTE(review): if %0 is chosen as an esp-relative memory operand, the two
    // preceding pushes would shift its offset -- confirm.
    #define MulPrologue \
        __asm__ __volatile__ \
        ( \
            "push %%ebx;"	/* save this manually, in case of -fPIC */ \
            "push %%ebp;" \
            "push %0;" \
            ".intel_syntax noprefix;"
    // Closes the asm statement: drops the pushed Z, restores ebp and ebx and
    // binds the operands (Z in a register or memory, X in esi, Y in ecx).
    #define MulEpilogue \
            "add esp, 4;" \
            "pop ebp;" \
            "pop ebx;" \
            ".att_syntax prefix;" \
            : \
            : "rm" (Z), "S" (X), "c" (Y) \
            : "%eax", "%edx", "%edi", "memory", "cc" \
        );
#endif

// C[0..N) = A[0..N) + B[0..N), propagating the carry from word to word with
// adc.  The carry out of the top word is left in eax, which serves as the
// return value (there is no C-level return statement).
// Two words are processed per loop iteration, so N is expected to be even;
// N == 0 skips the loop and returns 0.
TAOCRYPT_NAKED word PentiumOptimized::Add(word *C, const word *A,
                                          const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2(    sub ecx, edx)           // hold the distance between C & A so we
                                    // can add this to A to get C
    AS2(    xor eax, eax)           // clear eax

    AS2(    sub eax, esi)           // eax is a negative index from end of B
    AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B

    AS2(    sar eax, 1)             // unit of eax is now dwords; this also
                                    // clears the carry flag
    AS1(    jz  loopendAdd)         // if no dwords then nothing to do

    // Inside the loop only adc changes the carry flag: mov, lea and inc
    // leave it alone, so the carry chains from one iteration to the next.
    AS1(loopstartAdd:)
    AS2(    mov    esi,[edx])           // load lower word of A
    AS2(    mov    ebp,[edx+4])         // load higher word of A

    AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
    AS2(    lea    edx,[edx+8])         // advance A and C

    AS2(    adc    esi,edi)             // add lower words
    AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B

    AS2(    adc    ebp,edi)             // add higher words
    AS1(    inc    eax)                 // advance B

    AS2(    mov    [edx+ecx-8],esi)     // store lower word result
    AS2(    mov    [edx+ecx-4],ebp)     // store higher word result

    AS1(    jnz    loopstartAdd)   // loop until eax overflows and becomes zero

    AS1(loopendAdd:)
    AS2(    adc eax, 0)     // store carry into eax (return result register)

    AddEpilogue
}

// C[0..N) = A[0..N) - B[0..N), propagating the borrow from word to word with
// sbb.  The borrow out of the top word is left in eax, which serves as the
// return value (there is no C-level return statement).
// Two words are processed per loop iteration, so N is expected to be even;
// N == 0 skips the loop and returns 0.
TAOCRYPT_NAKED word PentiumOptimized::Subtract(word *C, const word *A,
                                               const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2(    sub ecx, edx)           // hold the distance between C & A so we
                                    // can add this to A to get C
    AS2(    xor eax, eax)           // clear eax

    AS2(    sub eax, esi)           // eax is a negative index from end of B
    AS2(    lea ebx, [ebx+4*esi])   // ebx is end of B

    AS2(    sar eax, 1)             // unit of eax is now dwords; this also
                                    // clears the carry flag
    AS1(    jz  loopendSub)         // if no dwords then nothing to do

    // Inside the loop only sbb changes the carry flag: mov, lea and inc
    // leave it alone, so the borrow chains from one iteration to the next.
    AS1(loopstartSub:)
    AS2(    mov    esi,[edx])           // load lower word of A
    AS2(    mov    ebp,[edx+4])         // load higher word of A

    AS2(    mov    edi,[ebx+8*eax])     // load lower word of B
    AS2(    lea    edx,[edx+8])         // advance A and C

    AS2(    sbb    esi,edi)             // subtract lower words
    AS2(    mov    edi,[ebx+8*eax+4])   // load higher word of B

    AS2(    sbb    ebp,edi)             // subtract higher words
    AS1(    inc    eax)                 // advance B

    AS2(    mov    [edx+ecx-8],esi)     // store lower word result
    AS2(    mov    [edx+ecx-4],ebp)     // store higher word result

    AS1(    jnz    loopstartSub)   // loop until eax overflows and becomes zero

    AS1(loopendSub:)
    AS2(    adc eax, 0)     // eax is 0 here; adding the carry flag (the final
                            // borrow) turns it into the return value

    AddEpilogue
}

// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.

// Same contract as PentiumOptimized::Add -- C[0..N) = A[0..N) + B[0..N),
// carry returned in eax -- but written without adc.  The running carry is
// kept as 0 or 1 in eax and folded in with add / jc / cmovc.
// Two words are processed per loop iteration, so N is expected to be even;
// N == 0 returns 0 immediately.
TAOCRYPT_NAKED word P4Optimized::Add(word *C, const word *A, const word *B,
                                     unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2(    xor     eax, eax)           // carry = 0
    AS1(    neg     esi)                // esi = -N, counts up towards zero
    AS1(    jz      loopendAddP4)       // if no dwords then nothing to do

    // first pair: no incoming carry, so enter directly at the B addition
    AS2(    mov     edi, [edx])
    AS2(    mov     ebp, [ebx])
    AS1(    jmp     carry1AddP4)

    AS1(loopstartAddP4:)
    AS2(    mov     edi, [edx+8])       // next lower word of A
    AS2(    add     ecx, 8)             // advance C
    AS2(    add     edx, 8)             // advance A
    AS2(    mov     ebp, [ebx])         // lower word of B (ebx already advanced)
    AS2(    add     edi, eax)           // add incoming carry
    AS1(    jc      carry1AddP4)        // wrapped: the carry (eax = 1) stands
    AS2(    xor     eax, eax)           // otherwise carry consumed

    AS1(carry1AddP4:)
    AS2(    add     edi, ebp)           // add lower word of B
    AS2(    mov     ebp, 1)
    AS2(    mov     [ecx], edi)         // store lower word result
    AS2(    mov     edi, [edx+4])       // higher word of A
    AS2(    cmovc   eax, ebp)           // carry = 1 if the B addition wrapped
    AS2(    mov     ebp, [ebx+4])       // higher word of B
    AS2(    add     ebx, 8)             // advance B
    AS2(    add     edi, eax)           // add carry from the lower word
    AS1(    jc      carry2AddP4)        // wrapped: the carry (eax = 1) stands
    AS2(    xor     eax, eax)           // otherwise carry consumed

    AS1(carry2AddP4:)
    AS2(    add     edi, ebp)           // add higher word of B
    AS2(    mov     ebp, 1)
    AS2(    cmovc   eax, ebp)           // carry = 1 if the B addition wrapped
    AS2(    mov     [ecx+4], edi)       // store higher word result
    AS2(    add     esi, 2)             // two words done
    AS1(    jnz     loopstartAddP4)

    AS1(loopendAddP4:)
    // eax holds the final carry and is the return value

    AddEpilogue
}

// Same contract as PentiumOptimized::Subtract -- C[0..N) = A[0..N) - B[0..N),
// borrow returned in eax -- but written without sbb.  The running borrow is
// kept as 0 or 1 in eax and folded in with sub / jc / cmovc.
// Two words are processed per loop iteration, so N is expected to be even;
// N == 0 returns 0 immediately.
TAOCRYPT_NAKED word P4Optimized::Subtract(word *C, const word *A,
                                          const word *B, unsigned int N)
{
    AddPrologue

    // now: ebx = B, ecx = C, edx = A, esi = N
    AS2(    xor     eax, eax)           // borrow = 0
    AS1(    neg     esi)                // esi = -N, counts up towards zero
    AS1(    jz      loopendSubP4)       // if no dwords then nothing to do

    // first pair: no incoming borrow, so enter directly at the B subtraction
    AS2(    mov     edi, [edx])
    AS2(    mov     ebp, [ebx])
    AS1(    jmp     carry1SubP4)

    AS1(loopstartSubP4:)
    AS2(    mov     edi, [edx+8])       // next lower word of A
    AS2(    add     edx, 8)             // advance A
    AS2(    add     ecx, 8)             // advance C
    AS2(    mov     ebp, [ebx])         // lower word of B (ebx already advanced)
    AS2(    sub     edi, eax)           // subtract incoming borrow
    AS1(    jc      carry1SubP4)        // wrapped: the borrow (eax = 1) stands
    AS2(    xor     eax, eax)           // otherwise borrow consumed

    AS1(carry1SubP4:)
    AS2(    sub     edi, ebp)           // subtract lower word of B
    AS2(    mov     ebp, 1)
    AS2(    mov     [ecx], edi)         // store lower word result
    AS2(    mov     edi, [edx+4])       // higher word of A
    AS2(    cmovc   eax, ebp)           // borrow = 1 if the B subtraction wrapped
    AS2(    mov     ebp, [ebx+4])       // higher word of B
    AS2(    add     ebx, 8)             // advance B
    AS2(    sub     edi, eax)           // subtract borrow from the lower word
    AS1(    jc      carry2SubP4)        // wrapped: the borrow (eax = 1) stands
    AS2(    xor     eax, eax)           // otherwise borrow consumed

    AS1(carry2SubP4:)
    AS2(    sub     edi, ebp)           // subtract higher word of B
    AS2(    mov     ebp, 1)
    AS2(    cmovc   eax, ebp)           // borrow = 1 if the B subtraction wrapped
    AS2(    mov     [ecx+4], edi)       // store higher word result
    AS2(    add     esi, 2)             // two words done
    AS1(    jnz     loopstartSubP4)

    AS1(loopendSubP4:)
    // eax holds the final borrow and is the return value

    AddEpilogue
}

// multiply assembly code originally contributed by Leonard Janke

// Building blocks for the multiply routines.  Register roles, as set up by
// MulPrologue: esi = X, ecx = Y, [esp] = Z (the result pointer).  The running
// column sum is kept in three parts: ebp (low word), edi (next word) and
// bl (overflow count); bh stays zero so that "adc bl, bh" adds just the carry.

// Clear the whole accumulator.
#define MulStartup \
    AS2(xor ebp, ebp) \
    AS2(xor edi, edi) \
    AS2(xor ebx, ebx) 

// Move the accumulator down by one word for the next column.  Expects the
// former middle word in edx, which is where MulStoreDigit leaves it.
#define MulShiftCarry \
    AS2(mov ebp, edx) \
    AS2(mov edi, ebx) \
    AS2(xor ebx, ebx)

// Add the low 32 bits of X[i]*Y[j] to ebp only; no carries are tracked.
#define MulAccumulateBottom(i,j) \
    AS2(mov eax, [ecx+4*j]) \
    AS2(imul eax, dword ptr [esi+4*i]) \
    AS2(add ebp, eax)

// Add the full 64-bit product X[i]*Y[j] (edx:eax) into the accumulator.
#define MulAccumulate(i,j) \
    AS2(mov eax, [ecx+4*j]) \
    AS1(mul dword ptr [esi+4*i]) \
    AS2(add ebp, eax) \
    AS2(adc edi, edx) \
    AS2(adc bl, bh)

// Write the finished low word to Z[i].  edi is needed for the Z pointer, so
// the middle word is parked in edx first.
#define MulStoreDigit(i)  \
    AS2(mov edx, edi) \
    AS2(mov edi, [esp]) \
    AS2(mov [edi+4*i], ebp)

// Final column: add X[digits-1]*Y[digits-1] and store the two top words of
// the 2*digits-word result.
#define MulLastDiagonal(digits) \
    AS2(mov eax, [ecx+4*(digits-1)]) \
    AS1(mul dword ptr [esi+4*(digits-1)]) \
    AS2(add ebp, eax) \
    AS2(adc edx, edi) \
    AS2(mov edi, [esp]) \
    AS2(mov [edi+4*(2*digits-2)], ebp) \
    AS2(mov [edi+4*(2*digits-1)], edx)

TAOCRYPT_NAKED void PentiumOptimized::Multiply4(word* Z, const word* X,
                                                const word* Y)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -