📄 integer.cpp
字号:
add edi, ebp
mov ebp, 1
cmovc eax, ebp
mov [ecx+4], edi
add ecx, 8
mov edi, [edx+8]
add edx, 8
add esi, 2
mov ebp, [ebx]
jnz loopstart
loopend:
mov edi, [esp]
mov esi, [esp+4]
mov ebx, [esp+8]
mov ebp, [esp+12]
add esp, 16
ret 8
carry1:
mov eax, 1
jmp carry1continue
carry2:
mov eax, 1
jmp carry2continue
}
}
// Multi-word subtraction C = A - B over N 32-bit words, written as a naked
// function in MSVC inline assembly (no compiler-generated prologue/epilogue).
// C arrives in ecx and A in edx; B and N are read from the stack and are
// removed by the final "ret 8".  The value left in eax (0 or 1) is the borrow
// out of the most significant word.
// The loop handles two words per iteration and counts up from -N in steps of
// two, so it only terminates as intended when N is even.
// NOTE(review): on the last iteration the loop preloads [edx+8] and [ebx],
// i.e. one word past the last word of A and B that is actually used - confirm
// that callers' buffers make this read harmless.
__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
__asm
{
sub esp, 16 // room to save edi, esi, ebx and ebp
xor eax, eax // eax = running borrow, initially 0
mov [esp], edi
mov [esp+4], esi
mov [esp+8], ebx
mov [esp+12], ebp
mov ebx, [esp+20] // B
mov esi, [esp+24] // N
// now: ebx = B, ecx = C, edx = A, esi = N
neg esi // esi = -N; also sets ZF when N == 0
jz loopend // if no dwords then nothing to do
mov edi, [edx] // preload the first word of A
mov ebp, [ebx] // preload the first word of B
loopstart:
sub edi, eax // first word: subtract the incoming borrow
jc carry1 // borrow generated: carry1 sets eax = 1 and jumps back
xor eax, eax // no borrow from that step
carry1continue:
sub edi, ebp // subtract the word of B
mov ebp, 1 // constant 1 for cmovc below (mov leaves the flags alone)
mov [ecx], edi // store the first result word
mov edi, [edx+4] // load the second word of A
cmovc eax, ebp // borrow from "sub edi, ebp" -> eax = 1
mov ebp, [ebx+4] // load the second word of B
lea ebx, [ebx+8] // advance B by two words
sub edi, eax // second word: subtract the borrow so far
jc carry2 // borrow generated: carry2 sets eax = 1 and jumps back
xor eax, eax
carry2continue:
sub edi, ebp
mov ebp, 1
cmovc eax, ebp // borrow from "sub edi, ebp" -> eax = 1
mov [ecx+4], edi // store the second result word
add ecx, 8 // advance C by two words
mov edi, [edx+8] // preload the next word of A
add edx, 8 // advance A by two words
add esi, 2 // two words done; ZF is set when the counter reaches 0
mov ebp, [ebx] // preload the next word of B (mov keeps ZF intact)
jnz loopstart
loopend:
mov edi, [esp] // restore the saved registers
mov esi, [esp+4]
mov ebx, [esp+8]
mov ebp, [esp+12]
add esp, 16
ret 8 // return and pop the two stack arguments (B and N)
carry1:
mov eax, 1 // record the borrow from the first word's borrow-in step
jmp carry1continue
carry2:
mov eax, 1 // record the borrow from the second word's borrow-in step
jmp carry2continue
}
}
#endif // #ifdef SSE2_INTRINSICS_AVAILABLE
#elif defined(__GNUC__) && defined(__i386__)
// Low-level word-array primitives for GCC on 32-bit x86, implemented below
// with inline assembly.  Anything not overridden here comes from Portable.
class PentiumOptimized : public Portable
{
public:
// C = A + B over N words; returns the carry out of the top word.
static word Add(word *C, const word *A, const word *B, unsigned int N);
// C = A - B over N words; returns the borrow out of the top word.
static word Subtract(word *C, const word *A, const word *B, unsigned int N);
// R[0..7] = A[0..3] squared.
static void Square4(word *R, const word *A);
// C[0..7] = A[0..3] * B[0..3].
static void Multiply4(word *C, const word *A, const word *B);
// C[0..15] = A[0..7] * B[0..7].
static void Multiply8(word *C, const word *A, const word *B);
};
// Select this class as the low-level implementation for this configuration.
typedef PentiumOptimized LowLevel;
// Add and Subtract assembly code originally contributed by Alister Lee
// Multi-word addition C = A + B over N words, two words per loop iteration.
// Returns the carry out of the most significant word.  N must be even
// (asserted).  Operand numbering inside the asm template:
//   %0 = carry / loop counter, %1 = B, %2 = C, %3 = A, %4 = N, %5 = temp.
// ebp is used as an extra scratch register and is saved/restored by the
// push/pop pair inside the template.
__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
assert (N%2 == 0);
register word carry, temp;
__asm__ __volatile__(
"push %%ebp;"              // free ebp for use as scratch
"sub %3, %2;"              // %2 = C - A, so (%3,%2) addresses C relative to A
"xor %0, %0;"
"sub %4, %0;"              // %0 = -N
"lea (%1,%4,4), %1;"       // %1 = B + 4*N, one past the end of B
"sar $1, %0;"              // %0 = -N/2 word pairs; ZF set when N == 0, CF = 0 for even N
"jz 1f;"
"0:;"
"mov 0(%3), %4;"           // %4 = low word of the current pair of A
"mov 4(%3), %%ebp;"        // ebp = high word of the current pair of A
"mov (%1,%0,8), %5;"       // temp = matching low word of B (indexed back from its end)
"lea 8(%3), %3;"           // advance A by one pair without touching the flags
"adc %5, %4;"              // low word sum, consuming and producing the carry
"mov 4(%1,%0,8), %5;"      // temp = matching high word of B
"adc %5, %%ebp;"           // high word sum
"inc %0;"                  // next pair; inc sets ZF but leaves CF alone
"mov %4, -8(%3, %2);"      // store the low result word into C
"mov %%ebp, -4(%3, %2);"   // store the high result word into C
"jnz 0b;"
"1:;"
"adc $0, %0;"              // %0 is 0 here, so this leaves the final carry in %0
"pop %%ebp;"
: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
: : "cc", "memory");
return carry;
}
// Multi-word subtraction C = A - B over N words, two words per loop iteration.
// Returns the borrow out of the most significant word.  N must be even
// (asserted).  Operand numbering inside the asm template:
//   %0 = borrow / loop counter, %1 = B, %2 = C, %3 = A, %4 = N, %5 = temp.
// ebp is used as an extra scratch register and is saved/restored by the
// push/pop pair inside the template.
__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
assert (N%2 == 0);
register word carry, temp;
__asm__ __volatile__(
"push %%ebp;"              // free ebp for use as scratch
"sub %3, %2;"              // %2 = C - A, so (%3,%2) addresses C relative to A
"xor %0, %0;"
"sub %4, %0;"              // %0 = -N
"lea (%1,%4,4), %1;"       // %1 = B + 4*N, one past the end of B
"sar $1, %0;"              // %0 = -N/2 word pairs; ZF set when N == 0, CF = 0 for even N
"jz 1f;"
"0:;"
"mov 0(%3), %4;"           // %4 = low word of the current pair of A
"mov 4(%3), %%ebp;"        // ebp = high word of the current pair of A
"mov (%1,%0,8), %5;"       // temp = matching low word of B (indexed back from its end)
"lea 8(%3), %3;"           // advance A by one pair without touching the flags
"sbb %5, %4;"              // low word difference, consuming and producing the borrow
"mov 4(%1,%0,8), %5;"      // temp = matching high word of B
"sbb %5, %%ebp;"           // high word difference
"inc %0;"                  // next pair; inc sets ZF but leaves CF alone
"mov %4, -8(%3, %2);"      // store the low result word into C
"mov %%ebp, -4(%3, %2);"   // store the high result word into C
"jnz 0b;"
"1:;"
"adc $0, %0;"              // %0 is 0 here, so this leaves the final borrow (CF) in %0
"pop %%ebp;"
: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
: : "cc", "memory");
return carry;
}
// Comba square and multiply assembly code originally contributed by Leonard Janke
// Prologue for the squaring templates: saves ebp, esi and ebx on the stack and
// zeroes the three-register column accumulator (ebp = low, ebx = middle,
// ecx = high).  Must be paired with SqrCleanup.
#define SqrStartup \
"push %%ebp\n\t" \
"push %%esi\n\t" \
"push %%ebx\n\t" \
"xor %%ebp, %%ebp\n\t" \
"xor %%ebx, %%ebx\n\t" \
"xor %%ecx, %%ecx\n\t"
// Moves the accumulator down by one word for the next column: the middle word
// becomes the low word, the high word becomes the middle word, and the high
// word is cleared.
#define SqrShiftCarry \
"mov %%ebx, %%ebp\n\t" \
"mov %%ecx, %%ebx\n\t" \
"xor %%ecx, %%ecx\n\t"
// Adds the off-diagonal product of words i and j of the operand at (%esi) to
// the accumulator twice (the i,j and j,i terms of a square are equal).
// "adc %ch, %cl" folds each carry into cl; ch is never written by these
// macros, so it stays at the zero set by the xor of ecx.
#define SqrAccumulate(i,j) \
"mov 4*"#j"(%%esi), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t"
// Adds the diagonal product (word i of the operand at (%esi) times itself) to
// the accumulator once.
#define SqrAccumulateCentre(i) \
"mov 4*"#i"(%%esi), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t"
// Stores the low accumulator word (ebp) as result word X at (%edi).
// The definition must end on its last string: a trailing line-continuation
// here would splice whatever line follows into this macro's body.
#define SqrStoreDigit(X) \
"mov %%ebp, 4*"#X"(%%edi)\n\t"
// Final column of a square of `digits` words: adds the top diagonal product to
// the accumulator and stores the last two result words, 2*digits-2 (from ebp)
// and 2*digits-1 (from ebx), at (%edi).
#define SqrLastDiagonal(digits) \
"mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
"mull 4*("#digits"-1)(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
"mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
// Epilogue for the squaring templates: restores ebx, esi and ebp in the
// reverse of the order in which SqrStartup pushed them.
#define SqrCleanup \
"pop %%ebx\n\t" \
"pop %%esi\n\t" \
"pop %%ebp\n\t"
// Squares the 4-word operand X into the 8-word result Y, one result column at
// a time: each column's products are summed into a three-register accumulator,
// the low word is stored, and the accumulator is shifted down for the next
// column.
//
// Y is pinned to edi and X to esi.  The template preserves ebp, esi and ebx
// itself (SqrStartup / SqrCleanup push and pop them), so only eax, ecx, edx,
// the flags and memory are reported as clobbered.  ebp is deliberately not in
// the clobber list: it is restored before the statement ends, and naming it
// there is rejected by GCC whenever ebp is in use as the frame pointer.
void PentiumOptimized::Square4(word* Y, const word* X)
{
	__asm__ __volatile__(
		SqrStartup
		// column 0
		SqrAccumulateCentre(0)
		SqrStoreDigit(0)
		SqrShiftCarry
		// column 1
		SqrAccumulate(1,0)
		SqrStoreDigit(1)
		SqrShiftCarry
		// column 2
		SqrAccumulate(2,0)
		SqrAccumulateCentre(1)
		SqrStoreDigit(2)
		SqrShiftCarry
		// column 3
		SqrAccumulate(3,0)
		SqrAccumulate(2,1)
		SqrStoreDigit(3)
		SqrShiftCarry
		// column 4
		SqrAccumulate(3,1)
		SqrAccumulateCentre(2)
		SqrStoreDigit(4)
		SqrShiftCarry
		// column 5
		SqrAccumulate(3,2)
		SqrStoreDigit(5)
		SqrShiftCarry
		// columns 6 and 7
		SqrLastDiagonal(4)
		SqrCleanup
		:
		: "D" (Y), "S" (X)
		: "eax", "ecx", "edx", "cc", "memory"
	);
}
// Prologue for the multiplication templates: saves ebp, esi, ebx and edi
// (edi last, so the result pointer sits at the top of the stack), copies the
// second operand pointer from eax into ebx, and zeroes the three-register
// column accumulator (ebp = low, edi = middle, ecx = high).
// Must be paired with MulCleanup.
#define MulStartup \
"push %%ebp\n\t" \
"push %%esi\n\t" \
"push %%ebx\n\t" \
"push %%edi\n\t" \
"mov %%eax, %%ebx \n\t" \
"xor %%ebp, %%ebp\n\t" \
"xor %%edi, %%edi\n\t" \
"xor %%ecx, %%ecx\n\t"
// Moves the accumulator down by one word for the next column.  It takes the
// middle word from edx, where MulStoreDigit parked it, so it has to follow a
// MulStoreDigit; the high word (ecx) becomes the new middle word and is then
// cleared.
#define MulShiftCarry \
"mov %%edx, %%ebp\n\t" \
"mov %%ecx, %%edi\n\t" \
"xor %%ecx, %%ecx\n\t"
// Adds the product of word j of the operand at (%ebx) and word i of the
// operand at (%esi) to the accumulator.  "adc %ch, %cl" folds the carry into
// cl; ch is never written by these macros, so it stays at the zero set by the
// xor of ecx.
#define MulAccumulate(i,j) \
"mov 4*"#j"(%%ebx), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%edi\n\t" \
"adc %%ch, %%cl\n\t"
// Stores the low accumulator word (ebp) as result word X.  edi doubles as the
// middle accumulator word, so that value is parked in edx first and the result
// pointer is reloaded from the top of the stack.  The final store writes the
// just-loaded pointer back to the same stack slot.
// edi is left holding the pointer; MulShiftCarry re-establishes the
// accumulator from edx and ecx.
#define MulStoreDigit(X) \
"mov %%edi, %%edx \n\t" \
"mov (%%esp), %%edi \n\t" \
"mov %%ebp, 4*"#X"(%%edi)\n\t" \
"mov %%edi, (%%esp)\n\t"
// Final column of a digits x digits product: multiplies the top words of both
// operands, adds the product to the accumulator (the high half is summed into
// edx), reloads the result pointer from the top of the stack and stores the
// last two result words, 2*digits-2 (from ebp) and 2*digits-1 (from edx).
#define MulLastDiagonal(digits) \
"mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
"mull 4*("#digits"-1)(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edi, %%edx\n\t" \
"mov (%%esp), %%edi\n\t" \
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
"mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"
// Epilogue for the multiplication templates: restores edi, ebx, esi and ebp in
// the reverse of the order in which MulStartup pushed them.
#define MulCleanup \
"pop %%edi\n\t" \
"pop %%ebx\n\t" \
"pop %%esi\n\t" \
"pop %%ebp\n\t"
// Multiplies the 4-word operands X and Y into the 8-word result Z, one result
// column at a time: each column's products are summed into a three-register
// accumulator, the low word is stored, and the accumulator is shifted down for
// the next column.
//
// Z is pinned to edi, X to esi and Y to eax.  The template preserves ebp, esi,
// ebx and edi itself (MulStartup / MulCleanup), but eax is overwritten by every
// multiply.  Y is therefore declared as a read-write operand ("+a"): an
// input-only operand must not be modified by the template, because the
// compiler assumes it still holds its original value afterwards.
void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
{
	__asm__ __volatile__(
		MulStartup
		// column 0
		MulAccumulate(0,0)
		MulStoreDigit(0)
		MulShiftCarry
		// column 1
		MulAccumulate(1,0)
		MulAccumulate(0,1)
		MulStoreDigit(1)
		MulShiftCarry
		// column 2
		MulAccumulate(2,0)
		MulAccumulate(1,1)
		MulAccumulate(0,2)
		MulStoreDigit(2)
		MulShiftCarry
		// column 3
		MulAccumulate(3,0)
		MulAccumulate(2,1)
		MulAccumulate(1,2)
		MulAccumulate(0,3)
		MulStoreDigit(3)
		MulShiftCarry
		// column 4
		MulAccumulate(3,1)
		MulAccumulate(2,2)
		MulAccumulate(1,3)
		MulStoreDigit(4)
		MulShiftCarry
		// column 5
		MulAccumulate(3,2)
		MulAccumulate(2,3)
		MulStoreDigit(5)
		MulShiftCarry
		// columns 6 and 7
		MulLastDiagonal(4)
		MulCleanup
		: "+a" (Y)
		: "D" (Z), "S" (X)
		: "%ecx", "%edx", "cc", "memory"
	);
}
// Multiplies the 8-word operands X and Y into the 16-word result Z, one result
// column at a time: each column's products are summed into a three-register
// accumulator, the low word is stored, and the accumulator is shifted down for
// the next column.
//
// Z is pinned to edi, X to esi and Y to eax.  The template preserves ebp, esi,
// ebx and edi itself (MulStartup / MulCleanup), but eax is overwritten by every
// multiply.  Y is therefore declared as a read-write operand ("+a"): an
// input-only operand must not be modified by the template, because the
// compiler assumes it still holds its original value afterwards.
void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
{
	__asm__ __volatile__(
		MulStartup
		// column 0
		MulAccumulate(0,0)
		MulStoreDigit(0)
		MulShiftCarry
		// column 1
		MulAccumulate(1,0)
		MulAccumulate(0,1)
		MulStoreDigit(1)
		MulShiftCarry
		// column 2
		MulAccumulate(2,0)
		MulAccumulate(1,1)
		MulAccumulate(0,2)
		MulStoreDigit(2)
		MulShiftCarry
		// column 3
		MulAccumulate(3,0)
		MulAccumulate(2,1)
		MulAccumulate(1,2)
		MulAccumulate(0,3)
		MulStoreDigit(3)
		MulShiftCarry
		// column 4
		MulAccumulate(4,0)
		MulAccumulate(3,1)
		MulAccumulate(2,2)
		MulAccumulate(1,3)
		MulAccumulate(0,4)
		MulStoreDigit(4)
		MulShiftCarry
		// column 5
		MulAccumulate(5,0)
		MulAccumulate(4,1)
		MulAccumulate(3,2)
		MulAccumulate(2,3)
		MulAccumulate(1,4)
		MulAccumulate(0,5)
		MulStoreDigit(5)
		MulShiftCarry
		// column 6
		MulAccumulate(6,0)
		MulAccumulate(5,1)
		MulAccumulate(4,2)
		MulAccumulate(3,3)
		MulAccumulate(2,4)
		MulAccumulate(1,5)
		MulAccumulate(0,6)
		MulStoreDigit(6)
		MulShiftCarry
		// column 7
		MulAccumulate(7,0)
		MulAccumulate(6,1)
		MulAccumulate(5,2)
		MulAccumulate(4,3)
		MulAccumulate(3,4)
		MulAccumulate(2,5)
		MulAccumulate(1,6)
		MulAccumulate(0,7)
		MulStoreDigit(7)
		MulShiftCarry
		// column 8
		MulAccumulate(7,1)
		MulAccumulate(6,2)
		MulAccumulate(5,3)
		MulAccumulate(4,4)
		MulAccumulate(3,5)
		MulAccumulate(2,6)
		MulAccumulate(1,7)
		MulStoreDigit(8)
		MulShiftCarry
		// column 9
		MulAccumulate(7,2)
		MulAccumulate(6,3)
		MulAccumulate(5,4)
		MulAccumulate(4,5)
		MulAccumulate(3,6)
		MulAccumulate(2,7)
		MulStoreDigit(9)
		MulShiftCarry
		// column 10
		MulAccumulate(7,3)
		MulAccumulate(6,4)
		MulAccumulate(5,5)
		MulAccumulate(4,6)
		MulAccumulate(3,7)
		MulStoreDigit(10)
		MulShiftCarry
		// column 11
		MulAccumulate(7,4)
		MulAccumulate(6,5)
		MulAccumulate(5,6)
		MulAccumulate(4,7)
		MulStoreDigit(11)
		MulShiftCarry
		// column 12
		MulAccumulate(7,5)
		MulAccumulate(6,6)
		MulAccumulate(5,7)
		MulStoreDigit(12)
		MulShiftCarry
		// column 13
		MulAccumulate(7,6)
		MulAccumulate(6,7)
		MulStoreDigit(13)
		MulShiftCarry
		// columns 14 and 15
		MulLastDiagonal(8)
		MulCleanup
		: "+a" (Y)
		: "D" (Z), "S" (X)
		: "%ecx", "%edx", "cc", "memory"
	);
}
#elif defined(__GNUC__) && defined(__alpha__)
// Low-level word-array primitives for GCC on Alpha.  The routines below
// process the word arrays through dword-sized accesses and use the UMULH
// instruction for the high half of each product.  Anything not overridden
// here comes from Portable.
class AlphaOptimized : public Portable
{
public:
// C[0..3] = product of the dword at A and the dword at B.
static inline void Multiply2(word *C, const word *A, const word *B);
// Adds that same product into C[0..3]; returns the carry out of the top.
static inline word Multiply2Add(word *C, const word *A, const word *B);
// C[0..7] = A[0..3] * B[0..3].
static inline void Multiply4(word *C, const word *A, const word *B);
// NOTE(review): presumably the operand size at which the generic recursive
// multiply hands over to Multiply4 - confirm against the callers.
static inline unsigned int MultiplyRecursionLimit() {return 4;}
// C[0..3] = low four words of A[0..3] * B[0..3].
static inline void Multiply4Bottom(word *C, const word *A, const word *B);
// NOTE(review): presumably the matching hand-over size for Multiply4Bottom -
// confirm against the callers.
static inline unsigned int MultiplyBottomRecursionLimit() {return 4;}
// Squaring has no dedicated routine here; it is a multiply of A by itself.
static inline void Square4(word *R, const word *A)
{
Multiply4(R, A, A);
}
};
// Select this class as the low-level implementation for this configuration.
typedef AlphaOptimized LowLevel;
// Full product of the dword at A and the dword at B, written to C as two
// dwords: low half first, then high half.
inline void AlphaOptimized::Multiply2(word *C, const word *A, const word *B)
{
	// Both operands are read in full before anything is stored through C.
	const dword lhs = *(const dword *)A;
	const dword rhs = *(const dword *)B;
	dword *const out = (dword *)C;

	// The ordinary multiply keeps only the low half of the product.
	out[0] = lhs*rhs;

	// UMULH supplies the high half.
	dword upper;
	__asm__("umulh %1,%2,%0" : "=r" (upper) : "r" (lhs), "r" (rhs));
	out[1] = upper;
}
// Adds the full product of the dword at A and the dword at B into the two
// dwords at C and returns the carry out of the upper dword.
inline word AlphaOptimized::Multiply2Add(word *C, const word *A, const word *B)
{
	const dword lhs = *(const dword *)A;
	const dword rhs = *(const dword *)B;
	dword *const acc = (dword *)C;

	// Lower dword: previous contents plus the low half of the product.
	const dword oldLow = acc[0];
	const dword newLow = lhs*rhs + oldLow;

	// High half of the product, via UMULH.
	dword prodHigh;
	__asm__("umulh %1,%2,%0" : "=r" (prodHigh) : "r" (lhs), "r" (rhs));

	acc[0] = newLow;

	// A wrapped sum is smaller than the value that was added to it.
	dword carry = (newLow < oldLow);

	// Upper dword: previous contents, plus the carry, plus the high half.
	dword high = acc[1] + carry;
	carry = (high < carry);
	high += prodHigh;
	acc[1] = high;
	carry |= (high < prodHigh);

	return carry;
}
// R[0..7] = A[0..3] * B[0..3], assembled from four half-size products.
inline void AlphaOptimized::Multiply4(word *R, const word *A, const word *B)
{
	const word *const aUpper = A+2;
	const word *const bUpper = B+2;
	word *const middle = R+2;

	// low*low fills R[0..3], high*high fills R[4..7]
	Multiply2(R, A, B);
	Multiply2(R+4, aUpper, bUpper);

	// both cross products are added into R[2..5]; collect their carries
	word overflow = Multiply2Add(middle, A, bUpper);
	overflow += Multiply2Add(middle, aUpper, B);

	// push the collected carries into the top two words
	Increment(R+6, 2, overflow);
}
// Adds the low dword of (dword at A) * (dword at B) into the dword at C.
// Only the truncated product is used; any carry out is discarded.
static inline void Multiply2BottomAdd(word *C, const word *A, const word *B)
{
	const dword lhs = *(const dword *)A;
	const dword rhs = *(const dword *)B;
	dword *const acc = (dword *)C;

	acc[0] = lhs*rhs + acc[0];
}
// R[0..3] = low four words of A[0..3] * B[0..3].
inline void AlphaOptimized::Multiply4Bottom(word *R, const word *A, const word *B)
{
	word *const upperHalf = R+2;

	// full low*low product
	Multiply2(R, A, B);

	// only the low dword of each cross product reaches the bottom half
	Multiply2BottomAdd(upperHalf, A, B+2);
	Multiply2BottomAdd(upperHalf, A+2, B);
}
#else // no processor specific code available
// None of the processor-specific classes was selected, so LowLevel is Portable itself.
typedef Portable LowLevel;
#endif
// ********************************************************
// Shorthand for slices of the operand (A, B), work space (T) and result (R)
// buffers.  The macros expand to expressions in the identifiers A, B, T, R, N
// and N2, which must be in scope wherever a macro is used.
// NOTE(review): N2 is presumably N/2, making these the halves of A and B and
// the quarters of T and R - confirm at the use sites.
#define A0 A
#define A1 (A+N2)
#define B0 B
#define B1 (B+N2)
#define T0 T
#define T1 (T+N2)
#define T2 (T+N)
#define T3 (T+N+N2)
#define R0 R
#define R1 (R+N2)
#define R2 (R+N)
#define R3 (R+N+N2)
// Forward declaration of the recursive multiplication routine.
// VC60 workaround: a compiler bug is triggered without the extra dummy
// parameter, which defaults to NULL.
// NOTE(review): P is presumably the low-level implementation class
// (e.g. LowLevel) - confirm at the call sites.
// R[2*N] - result = A*B
// T[2*N] - temporary work space
// A[N] --- multiplier
// B[N] --- multiplicand
template <class P>
void DoRecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL);
template <class P>
inline void RecursiveMultiply(word *R, word *T, const word *A, const word *B, unsigned int N, const P *dummy=NULL)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -