📄 integer.cpp
字号:
mov ebp, 1
mov [ecx], edi
mov edi, [edx+4]
cmovc eax, ebp
mov ebp, [ebx+4]
lea ebx, [ebx+8]
add edi, eax
jc carry2
xor eax, eax
carry2continue:
add edi, ebp
mov ebp, 1
cmovc eax, ebp
mov [ecx+4], edi
add ecx, 8
mov edi, [edx+8]
add edx, 8
add esi, 2
mov ebp, [ebx]
jnz loopstart
loopend:
mov edi, [esp]
mov esi, [esp+4]
mov ebx, [esp+8]
mov ebp, [esp+12]
add esp, 16
ret 8
carry1:
mov eax, 1
jmp carry1continue
carry2:
mov eax, 1
jmp carry2continue
}
}
// Subtracts the word array B from the word array A, two words per loop pass,
// writes the difference to C and returns the final borrow (0 or 1) in eax.
// Naked __fastcall routine: C arrives in ecx and A in edx, while B and N are on
// the stack and are popped by the closing "ret 8".
// The loop counter advances in steps of two, so it only reaches zero when N is even.
__declspec(naked) word __fastcall P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
__asm
{
sub esp, 16 // make room to save the four registers used as scratch below
xor eax, eax // eax carries the borrow between words; start with none
mov [esp], edi
mov [esp+4], esi
mov [esp+8], ebx
mov [esp+12], ebp
mov ebx, [esp+20] // B (above the 16 saved bytes and the return address)
mov esi, [esp+24] // N
// now: ebx = B, ecx = C, edx = A, esi = N
neg esi // count upwards from -N towards zero
jz loopend // if no dwords then nothing to do
mov edi, [edx] // preload the first word of A
mov ebp, [ebx] // preload the first word of B
loopstart:
sub edi, eax // apply the borrow left over from the previous word
jc carry1 // that step itself borrowed: carry1 sets eax = 1 and rejoins below
xor eax, eax // otherwise the incoming borrow is consumed
carry1continue:
sub edi, ebp // first word: A - borrow - B
mov ebp, 1 // constant for the cmovc below (mov does not change the flags)
mov [ecx], edi // store first result word
mov edi, [edx+4] // load second word of A
cmovc eax, ebp // eax = 1 if subtracting the B word borrowed
mov ebp, [ebx+4] // load second word of B
lea ebx, [ebx+8] // advance B by two words
sub edi, eax // second word: same sequence as above
jc carry2
xor eax, eax
carry2continue:
sub edi, ebp
mov ebp, 1
cmovc eax, ebp
mov [ecx+4], edi // store second result word
add ecx, 8 // advance C by two words
mov edi, [edx+8] // preload A for the next pass. NOTE(review): on the final pass this reads the word just past the N words processed (value unused)
add edx, 8 // advance A by two words
add esi, 2 // two words done; sets ZF when the counter reaches zero
mov ebp, [ebx] // preload B for the next pass (ebx already advanced); same read-past-the-end on the final pass
jnz loopstart // tests the flags from "add esi, 2" (the mov in between leaves them alone)
loopend:
mov edi, [esp] // restore the saved registers
mov esi, [esp+4]
mov ebx, [esp+8]
mov ebp, [esp+12]
add esp, 16
ret 8 // borrow is returned in eax; pop the two stack arguments (B and N)
carry1:
mov eax, 1 // out-of-line path: record the borrow, then resume
jmp carry1continue
carry2:
mov eax, 1
jmp carry2continue
}
}
#endif // #ifdef SSE2_INTRINSICS_AVAILABLE
#elif defined(__GNUC__) && defined(__i386__)
// x86 (GCC inline assembly) implementations of the low-level word-array
// primitives. Members not declared here come from the Portable base class.
class PentiumOptimized : public Portable
{
public:
	// Fixed-size squaring and multiplication kernels.
	static void Square4(word *R, const word *A);
	static void Multiply4(word *C, const word *A, const word *B);
	static void Multiply8(word *C, const word *A, const word *B);

	// -fpic uses up a register, leaving too few for the Add/Subtract asm code,
	// so position-independent builds do not declare them.
#if !defined(__pic__)
	static word Add(word *C, const word *A, const word *B, unsigned int N);
	static word Subtract(word *C, const word *A, const word *B, unsigned int N);
#endif
};

// Select this implementation as the low-level backend.
typedef PentiumOptimized LowLevel;
// Add and Subtract assembly code originally contributed by Alister Lee
#ifndef __pic__
// Adds the N-word arrays A and B, stores the N-word sum in C and returns the
// carry out of the last word. N must be even (asserted); words are handled in pairs.
// Asm operands: %0 = carry (doubles as the negative pair counter), %1 = B,
// %2 = C, %3 = A, %4 = N (reused as a data register inside the loop), %5 = temp.
__attribute__((regparm(3))) word PentiumOptimized::Add(word *C, const word *A, const word *B, unsigned int N)
{
assert (N%2 == 0);
register word carry, temp;
__asm__ __volatile__(
"push %%ebp;" // ebp is used as an extra scratch register; restored at the end
"sub %3, %2;" // C -= A, so the destination can be addressed relative to A
"xor %0, %0;"
"sub %4, %0;" // counter = -N
"lea (%1,%4,4), %1;" // B += N words; it is indexed below with the negative counter
"sar $1, %0;" // counter = -N/2 pairs; the bit shifted out is 0 for even N, so CF starts clear; ZF set when N == 0
"jz 1f;" // no words: skip the loop
"0:;"
"mov 0(%3), %4;" // first word of the pair from A
"mov 4(%3), %%ebp;" // second word of the pair from A
"mov (%1,%0,8), %5;" // matching first word from B
"lea 8(%3), %3;" // advance A by two words (lea does not touch the flags)
"adc %5, %4;" // first sum, chaining the carry through CF
"mov 4(%1,%0,8), %5;" // matching second word from B
"adc %5, %%ebp;" // second sum
"inc %0;" // next pair; inc updates ZF but preserves CF
"mov %4, -8(%3, %2);" // store into C: (advanced A) - 8 + (C - A)
"mov %%ebp, -4(%3, %2);"
"jnz 0b;" // loop until the counter reaches zero (ZF from inc; mov leaves flags alone)
"1:;"
"adc $0, %0;" // the counter is zero here, so this leaves just the final carry in %0
"pop %%ebp;"
// All five named operands plus temp live in registers; every pointer/count is
// read-write because the code advances or reuses it.
: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
: : "cc", "memory");
return carry;
}
// Subtracts the N-word array B from the N-word array A, stores the N-word
// difference in C and returns the borrow out of the last word. N must be even
// (asserted); words are handled in pairs.
// Asm operands: %0 = carry (doubles as the negative pair counter), %1 = B,
// %2 = C, %3 = A, %4 = N (reused as a data register inside the loop), %5 = temp.
__attribute__((regparm(3))) word PentiumOptimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
{
assert (N%2 == 0);
register word carry, temp;
__asm__ __volatile__(
"push %%ebp;" // ebp is used as an extra scratch register; restored at the end
"sub %3, %2;" // C -= A, so the destination can be addressed relative to A
"xor %0, %0;"
"sub %4, %0;" // counter = -N
"lea (%1,%4,4), %1;" // B += N words; it is indexed below with the negative counter
"sar $1, %0;" // counter = -N/2 pairs; the bit shifted out is 0 for even N, so CF starts clear; ZF set when N == 0
"jz 1f;" // no words: skip the loop
"0:;"
"mov 0(%3), %4;" // first word of the pair from A
"mov 4(%3), %%ebp;" // second word of the pair from A
"mov (%1,%0,8), %5;" // matching first word from B
"lea 8(%3), %3;" // advance A by two words (lea does not touch the flags)
"sbb %5, %4;" // first difference, chaining the borrow through CF
"mov 4(%1,%0,8), %5;" // matching second word from B
"sbb %5, %%ebp;" // second difference
"inc %0;" // next pair; inc updates ZF but preserves CF
"mov %4, -8(%3, %2);" // store into C: (advanced A) - 8 + (C - A)
"mov %%ebp, -4(%3, %2);"
"jnz 0b;" // loop until the counter reaches zero (ZF from inc; mov leaves flags alone)
"1:;"
"adc $0, %0;" // the counter is zero here, so this leaves just the final borrow (CF) in %0
"pop %%ebp;"
// All five named operands plus temp live in registers; every pointer/count is
// read-write because the code advances or reuses it.
: "=aSD" (carry), "+r" (B), "+r" (C), "+r" (A), "+r" (N), "=r" (temp)
: : "cc", "memory");
return carry;
}
#endif // __pic__
// Comba square and multiply assembly code originally contributed by Leonard Janke
// Prologue for the squaring asm: saves ebp, esi and ebx on the stack and clears
// the three accumulator registers ebp (low word), ebx (middle word) and ecx
// (carry count, kept in cl).
#define SqrStartup \
"push %%ebp\n\t" \
"push %%esi\n\t" \
"push %%ebx\n\t" \
"xor %%ebp, %%ebp\n\t" \
"xor %%ebx, %%ebx\n\t" \
"xor %%ecx, %%ecx\n\t"
// Moves the accumulator down by one word for the next result digit:
// middle -> low (ebp), carries -> middle (ebx), carries cleared (ecx).
#define SqrShiftCarry \
"mov %%ebx, %%ebp\n\t" \
"mov %%ecx, %%ebx\n\t" \
"xor %%ecx, %%ecx\n\t"
// Adds the product of input words i and j (read through esi) to the accumulator
// cl:ebx:ebp twice, covering both the (i,j) and (j,i) cross terms of the square.
// ch stays zero, so "adc %ch, %cl" simply folds the carry flag into cl.
#define SqrAccumulate(i,j) \
"mov 4*"#j"(%%esi), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t"
// Adds the square of input word i (read through esi) to the accumulator
// cl:ebx:ebp once; diagonal terms occur only once in a square.
#define SqrAccumulateCentre(i) \
"mov 4*"#i"(%%esi), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"adc %%ch, %%cl\n\t"
// Stores the low accumulator word (ebp) as result digit X, addressed through edi.
// The definition must end on its instruction line: a line-continuation backslash
// there would splice the next #define into this macro's replacement text.
#define SqrStoreDigit(X) \
"mov %%ebp, 4*"#X"(%%edi)\n\t"

// Final column of a digits-word square: adds the square of the top input word
// to ebp/ebx and stores the last two result digits (2*digits-2 and 2*digits-1).
#define SqrLastDiagonal(digits) \
"mov 4*("#digits"-1)(%%esi), %%eax\n\t" \
"mull 4*("#digits"-1)(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%ebx\n\t" \
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
"mov %%ebx, 4*(2*"#digits"-1)(%%edi)\n\t"
// Epilogue for the squaring asm: restores the registers pushed by SqrStartup,
// in reverse order.
#define SqrCleanup \
"pop %%ebx\n\t" \
"pop %%esi\n\t" \
"pop %%ebp\n\t"
// Squares the 4-word number X and writes the 8-word result to Y.
// The result is produced one digit (column) at a time: each group below adds
// every product X[i]*X[j] with i+j equal to the digit index, stores the low
// accumulator word and shifts the accumulator down for the next digit.
// NOTE(review): digits are stored while later columns still read X, so this
// looks unsafe if Y overlaps X - confirm against callers.
void PentiumOptimized::Square4(word* Y, const word* X)
{
__asm__ __volatile__(
SqrStartup
// digit 0
SqrAccumulateCentre(0)
SqrStoreDigit(0)
SqrShiftCarry
// digit 1
SqrAccumulate(1,0)
SqrStoreDigit(1)
SqrShiftCarry
// digit 2
SqrAccumulate(2,0)
SqrAccumulateCentre(1)
SqrStoreDigit(2)
SqrShiftCarry
// digit 3
SqrAccumulate(3,0)
SqrAccumulate(2,1)
SqrStoreDigit(3)
SqrShiftCarry
// digit 4
SqrAccumulate(3,1)
SqrAccumulateCentre(2)
SqrStoreDigit(4)
SqrShiftCarry
// digit 5
SqrAccumulate(3,2)
SqrStoreDigit(5)
SqrShiftCarry
// digits 6 and 7
SqrLastDiagonal(4)
SqrCleanup
:
// edi = Y (destination), esi = X (source); neither register is modified
: "D" (Y), "S" (X)
// ebp is listed here in addition to being saved/restored by SqrStartup/SqrCleanup
: "eax", "ecx", "edx", "ebp", "memory"
);
}
// Prologue for the multiplication asm: saves ebp, esi, ebx and edi (edi is
// pushed last, so the destination pointer sits at (%esp)), copies the second
// operand pointer from eax to ebx, and clears the accumulator registers
// ebp (low word), edi (middle word) and ecx (carry count, kept in cl).
#define MulStartup \
"push %%ebp\n\t" \
"push %%esi\n\t" \
"push %%ebx\n\t" \
"push %%edi\n\t" \
"mov %%eax, %%ebx \n\t" \
"xor %%ebp, %%ebp\n\t" \
"xor %%edi, %%edi\n\t" \
"xor %%ecx, %%ecx\n\t"
// Moves the accumulator down by one word for the next result digit. Expects the
// middle word in edx (where MulStoreDigit parks it): edx -> low (ebp),
// carries -> middle (edi), carries cleared (ecx).
#define MulShiftCarry \
"mov %%edx, %%ebp\n\t" \
"mov %%ecx, %%edi\n\t" \
"xor %%ecx, %%ecx\n\t"
// Adds the product of word i of the esi operand and word j of the ebx operand
// to the accumulator cl:edi:ebp. Overwrites eax and edx.
// ch stays zero, so "adc %ch, %cl" simply folds the carry flag into cl.
#define MulAccumulate(i,j) \
"mov 4*"#j"(%%ebx), %%eax\n\t" \
"mull 4*"#i"(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edx, %%edi\n\t" \
"adc %%ch, %%cl\n\t"
// Stores the low accumulator word (ebp) as result digit X. edi is needed as the
// destination pointer here, so the middle accumulator word is first parked in
// edx and the pointer is reloaded from its stack slot at (%esp). The last
// instruction writes the unchanged pointer back to that slot.
#define MulStoreDigit(X) \
"mov %%edi, %%edx \n\t" \
"mov (%%esp), %%edi \n\t" \
"mov %%ebp, 4*"#X"(%%edi)\n\t" \
"mov %%edi, (%%esp)\n\t"
// Final column of a digits-by-digits word product: multiplies the top words of
// both operands, adds the low half into ebp, lets edx (the high half) absorb the
// carry plus the remaining middle word from edi, then reloads the destination
// pointer from (%esp) and stores the last two result digits.
#define MulLastDiagonal(digits) \
"mov 4*("#digits"-1)(%%ebx), %%eax\n\t" \
"mull 4*("#digits"-1)(%%esi)\n\t" \
"add %%eax, %%ebp\n\t" \
"adc %%edi, %%edx\n\t" \
"mov (%%esp), %%edi\n\t" \
"mov %%ebp, 4*(2*"#digits"-2)(%%edi)\n\t" \
"mov %%edx, 4*(2*"#digits"-1)(%%edi)\n\t"
// Epilogue for the multiplication asm: restores the registers pushed by
// MulStartup, in reverse order.
#define MulCleanup \
"pop %%edi\n\t" \
"pop %%ebx\n\t" \
"pop %%esi\n\t" \
"pop %%ebp\n\t"
// Multiplies the 4-word numbers X and Y and writes the 8-word product to Z.
// The product is produced one digit (column) at a time: each group below adds
// every partial product with i+j equal to the digit index (i indexes X via esi,
// j indexes Y via ebx), stores the low accumulator word and shifts the
// accumulator down for the next digit.
// eax is overwritten by the asm (MulAccumulate loads it and mull writes it), so
// Y is passed as a read-write "+a" operand; an input-only operand must not be
// modified by the template. Y is not used after the asm statement.
void PentiumOptimized::Multiply4(word* Z, const word* X, const word* Y)
{
	__asm__ __volatile__(
		MulStartup
		// digit 0
		MulAccumulate(0,0)
		MulStoreDigit(0)
		MulShiftCarry
		// digit 1
		MulAccumulate(1,0)
		MulAccumulate(0,1)
		MulStoreDigit(1)
		MulShiftCarry
		// digit 2
		MulAccumulate(2,0)
		MulAccumulate(1,1)
		MulAccumulate(0,2)
		MulStoreDigit(2)
		MulShiftCarry
		// digit 3
		MulAccumulate(3,0)
		MulAccumulate(2,1)
		MulAccumulate(1,2)
		MulAccumulate(0,3)
		MulStoreDigit(3)
		MulShiftCarry
		// digit 4
		MulAccumulate(3,1)
		MulAccumulate(2,2)
		MulAccumulate(1,3)
		MulStoreDigit(4)
		MulShiftCarry
		// digit 5
		MulAccumulate(3,2)
		MulAccumulate(2,3)
		MulStoreDigit(5)
		MulShiftCarry
		// digits 6 and 7
		MulLastDiagonal(4)
		MulCleanup
		// eax = Y on entry, clobbered on exit
		: "+a" (Y)
		// edi = Z and esi = X; both hold their original values again on exit
		: "D" (Z), "S" (X)
		: "%ecx", "%edx", "memory"
	);
}
// Multiplies the 8-word numbers X and Y and writes the 16-word product to Z.
// The product is produced one digit (column) at a time: each group below adds
// every partial product with i+j equal to the digit index (i indexes X via esi,
// j indexes Y via ebx), stores the low accumulator word and shifts the
// accumulator down for the next digit.
// eax is overwritten by the asm (MulAccumulate loads it and mull writes it), so
// Y is passed as a read-write "+a" operand; an input-only operand must not be
// modified by the template. Y is not used after the asm statement.
void PentiumOptimized::Multiply8(word* Z, const word* X, const word* Y)
{
	__asm__ __volatile__(
		MulStartup
		// digit 0
		MulAccumulate(0,0)
		MulStoreDigit(0)
		MulShiftCarry
		// digit 1
		MulAccumulate(1,0)
		MulAccumulate(0,1)
		MulStoreDigit(1)
		MulShiftCarry
		// digit 2
		MulAccumulate(2,0)
		MulAccumulate(1,1)
		MulAccumulate(0,2)
		MulStoreDigit(2)
		MulShiftCarry
		// digit 3
		MulAccumulate(3,0)
		MulAccumulate(2,1)
		MulAccumulate(1,2)
		MulAccumulate(0,3)
		MulStoreDigit(3)
		MulShiftCarry
		// digit 4
		MulAccumulate(4,0)
		MulAccumulate(3,1)
		MulAccumulate(2,2)
		MulAccumulate(1,3)
		MulAccumulate(0,4)
		MulStoreDigit(4)
		MulShiftCarry
		// digit 5
		MulAccumulate(5,0)
		MulAccumulate(4,1)
		MulAccumulate(3,2)
		MulAccumulate(2,3)
		MulAccumulate(1,4)
		MulAccumulate(0,5)
		MulStoreDigit(5)
		MulShiftCarry
		// digit 6
		MulAccumulate(6,0)
		MulAccumulate(5,1)
		MulAccumulate(4,2)
		MulAccumulate(3,3)
		MulAccumulate(2,4)
		MulAccumulate(1,5)
		MulAccumulate(0,6)
		MulStoreDigit(6)
		MulShiftCarry
		// digit 7
		MulAccumulate(7,0)
		MulAccumulate(6,1)
		MulAccumulate(5,2)
		MulAccumulate(4,3)
		MulAccumulate(3,4)
		MulAccumulate(2,5)
		MulAccumulate(1,6)
		MulAccumulate(0,7)
		MulStoreDigit(7)
		MulShiftCarry
		// digit 8
		MulAccumulate(7,1)
		MulAccumulate(6,2)
		MulAccumulate(5,3)
		MulAccumulate(4,4)
		MulAccumulate(3,5)
		MulAccumulate(2,6)
		MulAccumulate(1,7)
		MulStoreDigit(8)
		MulShiftCarry
		// digit 9
		MulAccumulate(7,2)
		MulAccumulate(6,3)
		MulAccumulate(5,4)
		MulAccumulate(4,5)
		MulAccumulate(3,6)
		MulAccumulate(2,7)
		MulStoreDigit(9)
		MulShiftCarry
		// digit 10
		MulAccumulate(7,3)
		MulAccumulate(6,4)
		MulAccumulate(5,5)
		MulAccumulate(4,6)
		MulAccumulate(3,7)
		MulStoreDigit(10)
		MulShiftCarry
		// digit 11
		MulAccumulate(7,4)
		MulAccumulate(6,5)
		MulAccumulate(5,6)
		MulAccumulate(4,7)
		MulStoreDigit(11)
		MulShiftCarry
		// digit 12
		MulAccumulate(7,5)
		MulAccumulate(6,6)
		MulAccumulate(5,7)
		MulStoreDigit(12)
		MulShiftCarry
		// digit 13
		MulAccumulate(7,6)
		MulAccumulate(6,7)
		MulStoreDigit(13)
		MulShiftCarry
		// digits 14 and 15
		MulLastDiagonal(8)
		MulCleanup
		// eax = Y on entry, clobbered on exit
		: "+a" (Y)
		// edi = Z and esi = X; both hold their original values again on exit
		: "D" (Z), "S" (X)
		: "%ecx", "%edx", "memory"
	);
}
#elif defined(__GNUC__) && defined(__alpha__)
// Alpha (GCC) implementations of the small multiplication kernels. Members not
// declared here come from the Portable base class.
class AlphaOptimized : public Portable
{
public:
	// Multiplication kernels, defined after the class.
	static inline void Multiply2(word *C, const word *A, const word *B);
	static inline word Multiply2Add(word *C, const word *A, const word *B);
	static inline void Multiply4(word *C, const word *A, const word *B);
	static inline void Multiply4Bottom(word *C, const word *A, const word *B);

	// Squaring is done by multiplying the operand with itself.
	static inline void Square4(word *R, const word *A) { Multiply4(R, A, A); }

	// Both recursion limits are 4 for this implementation.
	static inline unsigned int MultiplyRecursionLimit() { return 4; }
	static inline unsigned int MultiplyBottomRecursionLimit() { return 4; }
};

// Select this implementation as the low-level backend.
typedef AlphaOptimized LowLevel;
// Multiplies the dword read from A by the dword read from B and writes the
// double-length product to C as two dwords: low half first, then the high half
// obtained with the umulh instruction.
inline void AlphaOptimized::Multiply2(word *C, const word *A, const word *B)
{
	// Fetch both operands before anything is written to C.
	const dword lhs = *(const dword *)A;
	const dword rhs = *(const dword *)B;
	dword *const product = (dword *)C;
	dword upperHalf;

	product[0] = lhs*rhs;
	__asm__("umulh %1,%2,%0" : "=r" (upperHalf) : "r" (lhs), "r" (rhs));
	product[1] = upperHalf;
}
// Adds the double-length product of the dwords read from A and B to the two
// dwords stored at C and returns the carry out of the upper dword (0 or 1).
inline word AlphaOptimized::Multiply2Add(word *C, const word *A, const word *B)
{
	const dword lhs = *(const dword *)A;
	const dword rhs = *(const dword *)B;
	dword *const acc = (dword *)C;

	// Lower dword: add the low half of the product.
	const dword oldLow = acc[0];
	const dword newLow = lhs*rhs + oldLow;
	dword productHigh;
	__asm__("umulh %1,%2,%0" : "=r" (productHigh) : "r" (lhs), "r" (rhs));
	acc[0] = newLow;

	// Upper dword: add the carry from below, then the high half of the product,
	// remembering whether either addition wrapped around.
	const dword lowCarry = (newLow < oldLow);
	dword high = acc[1] + lowCarry;
	dword carryOut = (high < lowCarry);
	high += productHigh;
	acc[1] = high;
	carryOut |= (high < productHigh);
	return carryOut;
}
// Multiplies the 4-word numbers A and B into the 8-word result R, built from
// four half-size products: low*low and high*high are written directly, the two
// cross products are added into the middle, and their carries are propagated
// into the top two words.
inline void AlphaOptimized::Multiply4(word *R, const word *A, const word *B)
{
	const word *const aLow = A;
	const word *const aHigh = A+2;
	const word *const bLow = B;
	const word *const bHigh = B+2;
	word *const middle = R+2;

	Multiply2(R, aLow, bLow);
	Multiply2(R+4, aHigh, bHigh);

	word carry = Multiply2Add(middle, aLow, bHigh);
	carry += Multiply2Add(middle, aHigh, bLow);
	Increment(R+6, 2, carry);
}
// Adds the low dword of the product of the dwords read from A and B to the
// dword stored at C; the high half of the product is discarded.
static inline void Multiply2BottomAdd(word *C, const word *A, const word *B)
{
	// Fetch both operands before touching C.
	const dword lhs = *(const dword *)A;
	const dword rhs = *(const dword *)B;
	dword *const acc = (dword *)C;
	acc[0] = lhs*rhs + acc[0];
}
// Computes the low 4 words of the product of the 4-word numbers A and B into R:
// the full low*low product, plus the low halves of the two cross products added
// into the upper two words.
inline void AlphaOptimized::Multiply4Bottom(word *R, const word *A, const word *B)
{
	const word *const aHigh = A+2;
	const word *const bHigh = B+2;
	word *const upper = R+2;

	Multiply2(R, A, B);
	Multiply2BottomAdd(upper, A, bHigh);
	Multiply2BottomAdd(upper, aHigh, B);
}
#else // no processor specific code available
typedef Portable LowLevel;
#endif
// ********************************************************
// Shorthand for sub-ranges of the operand arrays A and B and the scratch array T.
// They expand textually, so A, B, T, N and N2 must be in scope where they are used.
// A0/A1 and B0/B1 are the parts of A and B below and above offset N2;
// T0..T3 are consecutive slices of T starting at offsets 0, N2, N and N+N2.
// NOTE(review): presumably N2 == N/2, making T0..T3 equal quarters - confirm at the use sites.
#define A0 A
#define A1 (A+N2)
#define B0 B
#define B1 (B+N2)
#define T0 T
#define T1 (T+N2)
#define T2 (T+N)
#define T3 (T+N+N2)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -