📄 integer.cpp
字号:
AS2( movdqa xmm2, [ebx]) \
AS2( pmuludq xmm0, xmm1) \
AS2( pmuludq xmm1, [edx-(i)*16]) \
AS2( movdqa xmm3, xmm2) \
AS2( pand xmm2, xmm0) \
AS2( psrld xmm0, 16) \
AS2( paddd xmm4, xmm2) \
AS2( paddd xmm5, xmm0) \
AS2( pand xmm3, xmm1) \
AS2( psrld xmm1, 16) \
AS2( paddd xmm6, xmm3) \
AS2( paddd xmm7, xmm1) \
// Per-column dispatchers, selected by token pasting as Squ_Acc##i(i) inside
// Squ_Column0 and Squ_Column1.
//   Squ_Acc1        expands to nothing, so a column with i == 1 gets no extra
//                   accumulation step.
//   Squ_Acc2..8     all forward to Squ_Acc2, which hands the `call` mnemonic
//                   and the label LSqu<i> to ASC (defined elsewhere). The
//                   argument is passed through unchanged, so Squ_Acc5(5)
//                   targets LSqu5, not LSqu2.
// NOTE(review): the LSqu<i> labels are presumably emitted by Squ_Acc(i); only
// the tail of that macro is visible in this part of the file -- confirm.
#define Squ_Acc1(i)
#define Squ_Acc2(i) ASC(call, LSqu##i)
#define Squ_Acc3(i) Squ_Acc2(i)
#define Squ_Acc4(i) Squ_Acc2(i)
#define Squ_Acc5(i) Squ_Acc2(i)
#define Squ_Acc6(i) Squ_Acc2(i)
#define Squ_Acc7(i) Squ_Acc2(i)
#define Squ_Acc8(i) Squ_Acc2(i)
// SSE2_End(E, n): common tail shared by the Squ/Mul/Top kernels.
//   E  epilogue macro expanded last (SquEpilogue, MulEpilogue or TopEpilogue).
//   n  size parameter; used only to derive the arguments 2*(n)-3, 2*(n)-2 and
//      2*(n)-1 passed to SSE2_SaveShift / SSE2_FinalSave (defined elsewhere).
// Sequence: SSE2_SaveShift(2n-3); one multiply step; SSE2_SaveShift(2n-2);
// SSE2_FinalSave(2n-1); `pop esp`; E.
// The multiply step is the same instruction sequence as SSE2_MulAdd45 except
// that the multiplier is loaded from [esi+16] instead of [esi]: both products
// are split with the mask at [ebx] (pand) and a 16-bit right shift (psrld);
// the xmm0 halves are added into xmm4/xmm5, the xmm7 halves are left in
// xmm6/xmm7.
// `pop esp` reloads the stack pointer value that Mul_Begin/Top_Begin push
// right after aligning and reserving the scratch area.
// NOTE(review): [ebx] presumably holds a low-16-bits-per-dword mask; ebx is set
// up outside this part of the file -- confirm.
#define SSE2_End(E, n) \
SSE2_SaveShift(2*(n)-3) \
AS2( movdqa xmm7, [esi+16]) \
AS2( movdqa xmm0, [edi]) \
AS2( pmuludq xmm0, xmm7) \
AS2( movdqa xmm2, [ebx]) \
AS2( pmuludq xmm7, [edx]) \
AS2( movdqa xmm6, xmm2) \
AS2( pand xmm2, xmm0) \
AS2( psrld xmm0, 16) \
AS2( paddd xmm4, xmm2) \
AS2( paddd xmm5, xmm0) \
AS2( pand xmm6, xmm7) \
AS2( psrld xmm7, 16) \
SSE2_SaveShift(2*(n)-2) \
SSE2_FinalSave(2*(n)-1) \
AS1( pop esp)\
E
// Kernel tails: SSE2_End instantiated with the epilogue macro of the squaring,
// multiply and "top" kernel families respectively. n is forwarded unchanged.
#define Squ_End(n) SSE2_End(SquEpilogue, n)
#define Mul_End(n) SSE2_End(MulEpilogue, n)
#define Top_End(n) SSE2_End(TopEpilogue, n)
// Squ_Column1(k, i): one column step of the squaring kernels that advances esi.
//   k  forwarded to Squ_SSE2_SaveShift (defined elsewhere).
//   i  selects the accumulation helper Squ_Acc<i> by token pasting and is also
//      passed to it as its argument.
// Steps, in order:
//   1. Squ_SSE2_SaveShift(k); esi += 16; SSE2_FirstMultiply(1); Squ_Acc<i>(i).
//   2. xmm4 and xmm5 are doubled.
//   3. [esi] is multiplied by the qword at [esi+8] (result in xmm1) and by
//      itself (result in xmm3). Each product is split with the mask at [ebx]
//      (pand) and a 16-bit right shift (psrld).
//   4. The xmm1 halves are added to xmm6/xmm7, which are then doubled; the
//      self-product halves are added to xmm4/xmm5 after those were doubled,
//      so the self-product is counted once.
//   5. The qwords at [esp+4] and [esp+12] are added to xmm4 and xmm5.
// NOTE(review): [esp+4]/[esp+12] are presumably values parked there by
// Squ_SSE2_SaveShift -- confirm against its definition.
// The last line intentionally carries no line-continuation backslash: a
// trailing `\` would splice whatever line follows the macro into its
// replacement list.
#define Squ_Column1(k, i) \
Squ_SSE2_SaveShift(k) \
AS2( add esi, 16) \
SSE2_FirstMultiply(1)\
Squ_Acc##i(i) \
AS2( paddd xmm4, xmm4) \
AS2( paddd xmm5, xmm5) \
AS2( movdqa xmm3, [esi]) \
AS2( movq xmm1, QWORD PTR [esi+8]) \
AS2( pmuludq xmm1, xmm3) \
AS2( pmuludq xmm3, xmm3) \
AS2( movdqa xmm0, [ebx])\
AS2( movdqa xmm2, xmm0) \
AS2( pand xmm0, xmm1) \
AS2( psrld xmm1, 16) \
AS2( paddd xmm6, xmm0) \
AS2( paddd xmm7, xmm1) \
AS2( pand xmm2, xmm3) \
AS2( psrld xmm3, 16) \
AS2( paddd xmm6, xmm6) \
AS2( paddd xmm7, xmm7) \
AS2( paddd xmm4, xmm2) \
AS2( paddd xmm5, xmm3) \
AS2( movq xmm0, QWORD PTR [esp+4])\
AS2( movq xmm1, QWORD PTR [esp+12])\
AS2( paddd xmm4, xmm0)\
AS2( paddd xmm5, xmm1)
// Squ_Column0(k, i): one column step of the squaring kernels that advances
// edi and edx (the counterpart of Squ_Column1, which advances esi).
//   k  forwarded to Squ_SSE2_SaveShift (defined elsewhere).
//   i  selects the accumulation helper Squ_Acc<i> by token pasting and is also
//      passed to it as its argument.
// Steps, in order:
//   1. Squ_SSE2_SaveShift(k); edi += 16; edx += 16; SSE2_FirstMultiply(1);
//      Squ_Acc<i>(i).
//   2. All four accumulators xmm6, xmm7, xmm4, xmm5 are doubled.
//   3. The qwords at [esp+4] and [esp+12] are added to xmm4 and xmm5.
// Unlike Squ_Column1 there is no self-product term in this step.
// NOTE(review): [esp+4]/[esp+12] are presumably values parked there by
// Squ_SSE2_SaveShift -- confirm against its definition.
// The last line intentionally carries no line-continuation backslash: a
// trailing `\` would splice whatever line follows the macro into its
// replacement list.
#define Squ_Column0(k, i) \
Squ_SSE2_SaveShift(k) \
AS2( add edi, 16) \
AS2( add edx, 16) \
SSE2_FirstMultiply(1)\
Squ_Acc##i(i) \
AS2( paddd xmm6, xmm6) \
AS2( paddd xmm7, xmm7) \
AS2( paddd xmm4, xmm4) \
AS2( paddd xmm5, xmm5) \
AS2( movq xmm0, QWORD PTR [esp+4])\
AS2( movq xmm1, QWORD PTR [esp+12])\
AS2( paddd xmm4, xmm0)\
AS2( paddd xmm5, xmm1)
// SSE2_MulAdd45: first multiply step of a column in the Mul/Top kernels.
// Loads the multiplier from [esi] into xmm7 and forms two products with
// pmuludq: xmm0 = [edi] * xmm7 and xmm7 = xmm7 * [edx].
// Each product is split with the mask at [ebx] (pand) and a 16-bit right shift
// (psrld):
//   - the xmm0 halves are ADDED into the running accumulators xmm4 / xmm5
//     (hence the "45" in the name);
//   - the xmm7 halves OVERWRITE xmm6 / xmm7, i.e. they start those two
//     accumulators afresh for this column.
// Takes no parameters; operates on whatever esi/edi/edx/ebx currently hold.
#define SSE2_MulAdd45 \
AS2( movdqa xmm7, [esi]) \
AS2( movdqa xmm0, [edi]) \
AS2( pmuludq xmm0, xmm7) \
AS2( movdqa xmm2, [ebx]) \
AS2( pmuludq xmm7, [edx]) \
AS2( movdqa xmm6, xmm2) \
AS2( pand xmm2, xmm0) \
AS2( psrld xmm0, 16) \
AS2( paddd xmm4, xmm2) \
AS2( paddd xmm5, xmm0) \
AS2( pand xmm6, xmm7) \
AS2( psrld xmm7, 16)
// Mul_Begin(n): prologue of the Mul/Bot kernels. n is the size parameter used
// by the callers in this file (2, 4, 8, 16).
// Steps, in order:
//   1. MulPrologue (defined elsewhere).
//   2. Stack frame: the incoming esp is kept in esi, esp is rounded down to a
//      16-byte boundary, 48*n+16 bytes are reserved and the old esp is pushed
//      (it is popped again by `pop esp` in SSE2_End / Bot_End). After the
//      push, esp+4 is the 16-byte aligned base, so esp+20 is aligned as the
//      movdqa stores below require.
//   3. Copy loop (label 1), edx = 0, 16, ... until edx == 8*n: each 16-byte
//      group is loaded with pshufd and stored twice -- once as shuffled and
//      once after `psrlq 32` -- so 16 input bytes become 32 scratch bytes
//      (offset 2*edx). Three regions of 16*n bytes are filled:
//        esp+20         <- [eax+edx], selectors 3,1,2,0
//        esp+20+16*n    <- [eax+edx], selectors 2,0,3,1
//        esp+20+32*n    <- [edi+edx], selectors 3,1,2,0
//   4. edi, edx and esi are pointed at the start of those three regions, then
//      SSE2_FirstMultiply(0) (defined elsewhere) is expanded.
// NOTE(review): eax and edi are presumably the two operand pointers loaded by
// MulPrologue -- confirm against its definition.
// n is parenthesized at every use so that a non-literal argument cannot change
// the arithmetic, and the last line carries no line-continuation backslash so
// the line following the macro is not spliced into its replacement list.
#define Mul_Begin(n) \
MulPrologue \
AS2( mov esi, esp)\
AS2( and esp, 0xfffffff0)\
AS2( sub esp, 48*(n)+16)\
AS1( push esi)\
AS2( xor edx, edx) \
ASL(1) \
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
AS2( movdqa [esp+20+2*edx], xmm0) \
AS2( psrlq xmm0, 32) \
AS2( movdqa [esp+20+2*edx+16], xmm0) \
AS2( movdqa [esp+20+16*(n)+2*edx], xmm1) \
AS2( psrlq xmm1, 32) \
AS2( movdqa [esp+20+16*(n)+2*edx+16], xmm1) \
AS2( movdqa [esp+20+32*(n)+2*edx], xmm2) \
AS2( psrlq xmm2, 32) \
AS2( movdqa [esp+20+32*(n)+2*edx+16], xmm2) \
AS2( add edx, 16) \
AS2( cmp edx, 8*(n)) \
ASJ( jne, 1, b) \
AS2( lea edi, [esp+20])\
AS2( lea edx, [esp+20+16*(n)])\
AS2( lea esi, [esp+20+32*(n)])\
SSE2_FirstMultiply(0)
// Mul_Acc(i): body of the accumulation helper reached through label LMul<i>.
// The offset expression (i)/2*(1-((i)-2*((i)/2))*2)*16 evaluates to
//   +16*(i/2) when i is even and -16*(i/2) when i is odd
// ((i)-2*((i)/2) is i mod 2). With that offset `off`:
//   xmm1 = [esi+off], xmm0 = [edi-off] * xmm1, xmm1 = xmm1 * [edx-off]
// Each product is split with the mask at [ebx] (pand) and a 16-bit right shift
// (psrld); the xmm0 halves are added to xmm4/xmm5 and the xmm1 halves to
// xmm6/xmm7.
// The macro emits no `ret`. Callers in this file expand several bodies back to
// back in descending order (e.g. Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)) followed by
// a single `ret`, so entering at LMul<i> runs bodies i, i-1, ..., 2.
// i is parenthesized at every use, and the last line carries no
// line-continuation backslash so the line following the macro is not spliced
// into its replacement list.
#define Mul_Acc(i) \
ASL(LMul##i) \
AS2( movdqa xmm1, [esi+(i)/2*(1-((i)-2*((i)/2))*2)*16]) \
AS2( movdqa xmm0, [edi-(i)/2*(1-((i)-2*((i)/2))*2)*16]) \
AS2( movdqa xmm2, [ebx]) \
AS2( pmuludq xmm0, xmm1) \
AS2( pmuludq xmm1, [edx-(i)/2*(1-((i)-2*((i)/2))*2)*16]) \
AS2( movdqa xmm3, xmm2) \
AS2( pand xmm2, xmm0) \
AS2( psrld xmm0, 16) \
AS2( paddd xmm4, xmm2) \
AS2( paddd xmm5, xmm0) \
AS2( pand xmm3, xmm1) \
AS2( psrld xmm1, 16) \
AS2( paddd xmm6, xmm3) \
AS2( paddd xmm7, xmm1)
// Per-column dispatchers, selected by token pasting as Mul_Acc##i(i) inside
// Mul_Column0/Mul_Column1 and Top_Column0/Top_Column1.
//   Mul_Acc1         expands to nothing, so a column with i == 1 gets no extra
//                    accumulation step.
//   Mul_Acc2..16     all forward to Mul_Acc2, which hands the `call` mnemonic
//                    and the label LMul<i> to ASC (defined elsewhere).
//                    LMul<i> is the label placed at the top of Mul_Acc(i) via
//                    ASL(LMul##i). The argument is passed through unchanged,
//                    so Mul_Acc9(9) targets LMul9, not LMul2.
#define Mul_Acc1(i)
#define Mul_Acc2(i) ASC(call, LMul##i)
#define Mul_Acc3(i) Mul_Acc2(i)
#define Mul_Acc4(i) Mul_Acc2(i)
#define Mul_Acc5(i) Mul_Acc2(i)
#define Mul_Acc6(i) Mul_Acc2(i)
#define Mul_Acc7(i) Mul_Acc2(i)
#define Mul_Acc8(i) Mul_Acc2(i)
#define Mul_Acc9(i) Mul_Acc2(i)
#define Mul_Acc10(i) Mul_Acc2(i)
#define Mul_Acc11(i) Mul_Acc2(i)
#define Mul_Acc12(i) Mul_Acc2(i)
#define Mul_Acc13(i) Mul_Acc2(i)
#define Mul_Acc14(i) Mul_Acc2(i)
#define Mul_Acc15(i) Mul_Acc2(i)
#define Mul_Acc16(i) Mul_Acc2(i)
// Mul_Column1(k, i): one column step of the multiply kernels that advances esi.
//   k  forwarded to SSE2_SaveShift (defined elsewhere).
//   i  selects the accumulation helper Mul_Acc<i> by token pasting and is also
//      passed to it as its argument.
// Order: SSE2_SaveShift(k); esi += 16; SSE2_MulAdd45; Mul_Acc<i>(i).
// The last line intentionally carries no line-continuation backslash: a
// trailing `\` would splice whatever line follows the macro into its
// replacement list.
#define Mul_Column1(k, i) \
SSE2_SaveShift(k) \
AS2( add esi, 16) \
SSE2_MulAdd45\
Mul_Acc##i(i)
// Mul_Column0(k, i): one column step of the multiply kernels that advances
// edi and edx (the counterpart of Mul_Column1, which advances esi).
//   k  forwarded to SSE2_SaveShift (defined elsewhere).
//   i  selects the accumulation helper Mul_Acc<i> by token pasting and is also
//      passed to it as its argument.
// Order: SSE2_SaveShift(k); edi += 16; edx += 16; SSE2_MulAdd45; Mul_Acc<i>(i).
// The last line intentionally carries no line-continuation backslash: a
// trailing `\` would splice whatever line follows the macro into its
// replacement list.
#define Mul_Column0(k, i) \
SSE2_SaveShift(k) \
AS2( add edi, 16) \
AS2( add edx, 16) \
SSE2_MulAdd45\
Mul_Acc##i(i)
// Bot_Acc(i): accumulation step of the MultiplyBottom kernels.
// Uses the same offset expression as Mul_Acc: +16*(i/2) for even i and
// -16*(i/2) for odd i ((i)-2*((i)/2) is i mod 2). With that offset `off`:
//   xmm1 = [esi+off], xmm0 = [edi-off] * xmm1, xmm1 = xmm1 * [edx-off]
// Unlike Mul_Acc the products are not split into 16-bit halves: xmm0 is added
// to xmm4 as 64-bit lanes (paddq) and xmm1 to xmm6 as 32-bit lanes (paddd).
// No label is emitted; the macro is expanded inline by its callers.
// i is parenthesized at every use so that a non-literal argument cannot change
// the arithmetic.
#define Bot_Acc(i) \
AS2( movdqa xmm1, [esi+(i)/2*(1-((i)-2*((i)/2))*2)*16]) \
AS2( movdqa xmm0, [edi-(i)/2*(1-((i)-2*((i)/2))*2)*16]) \
AS2( pmuludq xmm0, xmm1) \
AS2( pmuludq xmm1, [edx-(i)/2*(1-((i)-2*((i)/2))*2)*16]) \
AS2( paddq xmm4, xmm0) \
AS2( paddd xmm6, xmm1)
// Bot_SaveAcc(k): opens the final column of the MultiplyBottom kernels.
//   k  forwarded to SSE2_SaveShift (defined elsewhere).
// Order: SSE2_SaveShift(k); edi += 16; edx += 16; then
//   xmm6 = [esi]; xmm0 = [edi] * xmm6, added to xmm4 with paddq;
//   xmm5 is shifted left by 16 bits per qword and added to xmm4 (paddq),
//   folding the xmm5 accumulator into xmm4;
//   xmm6 = xmm6 * [edx], which starts the xmm6 accumulator that Bot_Acc adds
//   to and Bot_End consumes.
#define Bot_SaveAcc(k) \
SSE2_SaveShift(k) \
AS2( add edi, 16) \
AS2( add edx, 16) \
AS2( movdqa xmm6, [esi]) \
AS2( movdqa xmm0, [edi]) \
AS2( pmuludq xmm0, xmm6) \
AS2( paddq xmm4, xmm0) \
AS2( psllq xmm5, 16) \
AS2( paddq xmm4, xmm5) \
AS2( pmuludq xmm6, [edx])
// Bot_End(n): tail of the MultiplyBottom kernels.
//   n  size parameter; selects the destination qword at [ecx+8*((n)-1)].
// Order:
//   movhlps copies the high qword of xmm6 into the low qword of xmm7 and the
//   two are added (paddd), combining both halves of xmm6;
//   the sum is shifted left by 32 bits per qword and added to xmm4 (paddd),
//   so it lands in the upper dword of xmm4's low qword;
//   the low qword of xmm4 is stored to [ecx+8*((n)-1)];
//   `pop esp` reloads the stack pointer pushed by Mul_Begin; MulEpilogue.
// NOTE(review): ecx is presumably the result pointer C, loaded outside this
// part of the file -- confirm.
#define Bot_End(n) \
AS2( movhlps xmm7, xmm6) \
AS2( paddd xmm6, xmm7) \
AS2( psllq xmm6, 32) \
AS2( paddd xmm4, xmm6) \
AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \
AS1( pop esp)\
MulEpilogue
// Top_Begin(n): prologue of the "top" kernels. Same stack frame and copy loop
// as Mul_Begin, with a different start position.
// Steps, in order:
//   1. TopPrologue (defined elsewhere).
//   2. Stack frame: the incoming esp is kept in edx, esp is rounded down to a
//      16-byte boundary, 48*n+16 bytes are reserved and the old esp is pushed
//      (popped again by `pop esp` in SSE2_End).
//   3. Copy loop (label 1), edx = 0, 16, ... until edx == 8*n: each 16-byte
//      group is loaded with pshufd and stored twice -- as shuffled and after
//      `psrlq 32` -- into three regions of 16*n bytes:
//        esp+20         <- [eax+edx], selectors 3,1,2,0
//        esp+20+16*n    <- [eax+edx], selectors 2,0,3,1
//        esp+20+32*n    <- [edi+edx], selectors 3,1,2,0
//   4. eax = esi (the value is consumed later by Top_Column1).
//   5. edi, edx and esi are pointed 16*(n/2-1) bytes into their regions rather
//      than at the start, and xmm4/xmm5 are zeroed. No SSE2_FirstMultiply is
//      expanded here.
// NOTE(review): eax/edi are presumably the operand pointers and esi an extra
// input loaded by TopPrologue -- confirm against its definition.
// n is parenthesized at every use so that a non-literal argument cannot change
// the arithmetic.
#define Top_Begin(n) \
TopPrologue \
AS2( mov edx, esp)\
AS2( and esp, 0xfffffff0)\
AS2( sub esp, 48*(n)+16)\
AS1( push edx)\
AS2( xor edx, edx) \
ASL(1) \
ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \
ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \
ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \
AS2( movdqa [esp+20+2*edx], xmm0) \
AS2( psrlq xmm0, 32) \
AS2( movdqa [esp+20+2*edx+16], xmm0) \
AS2( movdqa [esp+20+16*(n)+2*edx], xmm1) \
AS2( psrlq xmm1, 32) \
AS2( movdqa [esp+20+16*(n)+2*edx+16], xmm1) \
AS2( movdqa [esp+20+32*(n)+2*edx], xmm2) \
AS2( psrlq xmm2, 32) \
AS2( movdqa [esp+20+32*(n)+2*edx+16], xmm2) \
AS2( add edx, 16) \
AS2( cmp edx, 8*(n)) \
ASJ( jne, 1, b) \
AS2( mov eax, esi) \
AS2( lea edi, [esp+20+00*(n)+16*((n)/2-1)])\
AS2( lea edx, [esp+20+16*(n)+16*((n)/2-1)])\
AS2( lea esi, [esp+20+32*(n)+16*((n)/2-1)])\
AS2( pxor xmm4, xmm4)\
AS2( pxor xmm5, xmm5)
// Top_Acc(i): partial accumulation step of the "top" kernels.
// Uses the same offset expression as Mul_Acc: +16*(i/2) for even i and
// -16*(i/2) for odd i. With that offset `off`:
//   xmm0 = qword at [esi+off+8]; xmm0 = xmm0 * [edx-off] (pmuludq);
//   the product is shifted right by 48 bits per qword, keeping only its top
//   16 bits, and added to xmm5 (paddd).
// i is parenthesized at every use, and the last line carries no
// line-continuation backslash so the line following the macro is not spliced
// into its replacement list.
#define Top_Acc(i) \
AS2( movq xmm0, QWORD PTR [esi+(i)/2*(1-((i)-2*((i)/2))*2)*16+8]) \
AS2( pmuludq xmm0, [edx-(i)/2*(1-((i)-2*((i)/2))*2)*16]) \
AS2( psrlq xmm0, 48) \
AS2( paddd xmm5, xmm0)
// Top_Column0(i): column step of the "top" kernels that advances edi and edx.
//   i  selects the accumulation helper Mul_Acc<i> by token pasting and is also
//      passed to it as its argument.
// Order: xmm5 is shifted left by 32 bits per qword; edi += 16; edx += 16;
// SSE2_MulAdd45; Mul_Acc<i>(i).
// Unlike Mul_Column0 there is no SSE2_SaveShift before the multiply.
// The last line intentionally carries no line-continuation backslash: a
// trailing `\` would splice whatever line follows the macro into its
// replacement list.
#define Top_Column0(i) \
AS2( psllq xmm5, 32) \
AS2( add edi, 16) \
AS2( add edx, 16) \
SSE2_MulAdd45\
Mul_Acc##i(i)
// Top_Column1(i): column step of the "top" kernels that advances esi and then
// applies a one-unit correction to xmm4.
//   i  selects the accumulation helper Mul_Acc<i> by token pasting and is also
//      passed to it as its argument.
// Order:
//   1. SSE2_SaveShift(0); esi += 16; SSE2_MulAdd45; Mul_Acc<i>(i).
//   2. eax is shifted right by 16 and moved to xmm0; the dword at [ecx+4] is
//      loaded and also shifted right by 16. pcmpgtd sets the lane to all ones
//      when ([ecx+4] >> 16) > (eax >> 16); `psrld 31` turns that into 0 or 1,
//      which is added to xmm4.
// eax holds the value Top_Begin copied from esi.
// NOTE(review): the comparison presumably derives a carry for the truncated
// low part of the product -- confirm against the caller's contract.
// The last line intentionally carries no line-continuation backslash: a
// trailing `\` would splice the line that follows the macro (here a function
// definition) into its replacement list.
#define Top_Column1(i) \
SSE2_SaveShift(0) \
AS2( add esi, 16) \
SSE2_MulAdd45\
Mul_Acc##i(i) \
AS2( shr eax, 16) \
AS2( movd xmm0, eax)\
AS2( movd xmm1, [ecx+4])\
AS2( psrld xmm1, 16)\
AS2( pcmpgtd xmm1, xmm0)\
AS2( psrld xmm1, 31)\
AS2( paddd xmm4, xmm1)
// SSE2_Square4: smallest SSE2 squaring kernel, built entirely from the Squ_*
// macros with size parameter n = 2 (going by the name, a 4-word operand).
// C and A are never named in the body; the prologue inside Squ_Begin (defined
// elsewhere) is presumably what loads them into registers -- confirm.
// Only one column step is needed and it uses i == 1, for which Squ_Acc1 is
// empty, so no LSqu<i> helper is called.
void SSE2_Square4(word *C, const word *A)
{
Squ_Begin(2)
Squ_Column0(0, 1)
Squ_End(2)
}
// SSE2_Square8: SSE2 squaring kernel with size parameter n = 4 (going by the
// name, an 8-word operand). C and A are not named in the body; Squ_Begin's
// prologue (defined elsewhere) presumably loads them -- confirm.
// Columns alternate Squ_Column0 / Squ_Column1 with k = 0..4; the helper index
// i peaks at 2 in the middle column.
void SSE2_Square8(word *C, const word *A)
{
Squ_Begin(4)
// On non-GNU compilers the helper body for i == 2 is emitted inline: the jmp
// skips over it to label 0, and Squ_Column0(2, 2) reaches it through the
// `call` in Squ_Acc2.
// NOTE(review): under __GNUC__ no helper is emitted here, so that call must
// resolve to a label emitted elsewhere (SSE2_Square32 expands Squ_Acc(8..2)
// unconditionally) -- confirm.
#ifndef __GNUC__
ASJ( jmp, 0, f)
Squ_Acc(2)
AS1( ret) ASL(0)
#endif
Squ_Column0(0, 1)
Squ_Column1(1, 1)
Squ_Column0(2, 2)
Squ_Column1(3, 1)
Squ_Column0(4, 1)
Squ_End(4)
}
// SSE2_Square16: SSE2 squaring kernel with size parameter n = 8 (going by the
// name, a 16-word operand). C and A are not named in the body; Squ_Begin's
// prologue (defined elsewhere) presumably loads them -- confirm.
// Columns alternate Squ_Column0 / Squ_Column1 with k = 0..12; the helper index
// i climbs 1,1,2,2,3,3 to 4 at k == 6 and then falls back 3,3,2,2,1,1.
void SSE2_Square16(word *C, const word *A)
{
Squ_Begin(8)
// On non-GNU compilers the helper bodies for i = 4, 3, 2 are emitted inline,
// back to back with a single ret, and skipped via the jmp to label 0.
// NOTE(review): under __GNUC__ no helpers are emitted here, so the calls made
// by Squ_Acc<i> must resolve to labels emitted elsewhere (SSE2_Square32 expands
// Squ_Acc(8..2) unconditionally) -- confirm.
#ifndef __GNUC__
ASJ( jmp, 0, f)
Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
AS1( ret) ASL(0)
#endif
Squ_Column0(0, 1)
Squ_Column1(1, 1)
Squ_Column0(2, 2)
Squ_Column1(3, 2)
Squ_Column0(4, 3)
Squ_Column1(5, 3)
Squ_Column0(6, 4)
Squ_Column1(7, 3)
Squ_Column0(8, 3)
Squ_Column1(9, 2)
Squ_Column0(10, 2)
Squ_Column1(11, 1)
Squ_Column0(12, 1)
Squ_End(8)
}
// SSE2_Square32: largest SSE2 squaring kernel, size parameter n = 16 (going by
// the name, a 32-word operand). C and A are not named in the body; Squ_Begin's
// prologue (defined elsewhere) presumably loads them -- confirm.
// Columns alternate Squ_Column0 / Squ_Column1 with k = 0..28; the helper index
// i climbs in pairs from 1 to 8 at k == 14 and then falls back to 1.
void SSE2_Square32(word *C, const word *A)
{
Squ_Begin(16)
// Helper bodies for i = 8..2 are emitted here on every compiler (there is no
// __GNUC__ guard, unlike the smaller kernels), back to back with a single ret,
// and skipped on entry via the jmp to label 0.
ASJ( jmp, 0, f)
Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2)
AS1( ret) ASL(0)
Squ_Column0(0, 1)
Squ_Column1(1, 1)
Squ_Column0(2, 2)
Squ_Column1(3, 2)
Squ_Column0(4, 3)
Squ_Column1(5, 3)
Squ_Column0(6, 4)
Squ_Column1(7, 4)
Squ_Column0(8, 5)
Squ_Column1(9, 5)
Squ_Column0(10, 6)
Squ_Column1(11, 6)
Squ_Column0(12, 7)
Squ_Column1(13, 7)
Squ_Column0(14, 8)
Squ_Column1(15, 7)
Squ_Column0(16, 7)
Squ_Column1(17, 6)
Squ_Column0(18, 6)
Squ_Column1(19, 5)
Squ_Column0(20, 5)
Squ_Column1(21, 4)
Squ_Column0(22, 4)
Squ_Column1(23, 3)
Squ_Column0(24, 3)
Squ_Column1(25, 2)
Squ_Column0(26, 2)
Squ_Column1(27, 1)
Squ_Column0(28, 1)
Squ_End(16)
}
// SSE2_Multiply4: smallest SSE2 multiply kernel, size parameter n = 2 (going
// by the name, 4-word operands). C, A and B are not named in the body;
// MulPrologue inside Mul_Begin (defined elsewhere) presumably loads them into
// registers -- confirm.
// A single column step with i == 2 is used, which calls LMul2 via Mul_Acc2.
void SSE2_Multiply4(word *C, const word *A, const word *B)
{
Mul_Begin(2)
// On non-GNU compilers the LMul2 helper body is emitted inline followed by a
// ret, and skipped on entry via the jmp to label 0.
// NOTE(review): under __GNUC__ no helper is emitted here, so the call must
// resolve to a label emitted elsewhere (SSE2_Multiply32 expands
// Mul_Acc(16..2) unconditionally) -- confirm.
#ifndef __GNUC__
ASJ( jmp, 0, f)
Mul_Acc(2)
AS1( ret) ASL(0)
#endif
Mul_Column0(0, 2)
Mul_End(2)
}
// SSE2_Multiply8: SSE2 multiply kernel with size parameter n = 4 (going by the
// name, 8-word operands). C, A and B are not named in the body; MulPrologue
// inside Mul_Begin (defined elsewhere) presumably loads them -- confirm.
// Columns alternate Mul_Column0 / Mul_Column1 with k = 0..4; the helper index
// i runs 2, 3, 4, 3, 2.
void SSE2_Multiply8(word *C, const word *A, const word *B)
{
Mul_Begin(4)
// On non-GNU compilers the helper bodies LMul4, LMul3, LMul2 are emitted
// inline, back to back with a single ret (so entering at LMul<i> falls through
// the lower-numbered bodies), and skipped on entry via the jmp to label 0.
// NOTE(review): under __GNUC__ no helpers are emitted here, so the calls must
// resolve to labels emitted elsewhere (SSE2_Multiply32 expands Mul_Acc(16..2)
// unconditionally) -- confirm.
#ifndef __GNUC__
ASJ( jmp, 0, f)
Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
AS1( ret) ASL(0)
#endif
Mul_Column0(0, 2)
Mul_Column1(1, 3)
Mul_Column0(2, 4)
Mul_Column1(3, 3)
Mul_Column0(4, 2)
Mul_End(4)
}
// SSE2_Multiply16: SSE2 multiply kernel with size parameter n = 8 (going by
// the name, 16-word operands). C, A and B are not named in the body;
// MulPrologue inside Mul_Begin (defined elsewhere) presumably loads them --
// confirm.
// Columns alternate Mul_Column0 / Mul_Column1 with k = 0..12; the helper index
// i climbs from 2 to 8 at k == 6 and then falls back to 2.
void SSE2_Multiply16(word *C, const word *A, const word *B)
{
Mul_Begin(8)
// On non-GNU compilers the helper bodies LMul8..LMul2 are emitted inline, back
// to back with a single ret, and skipped on entry via the jmp to label 0.
// NOTE(review): under __GNUC__ no helpers are emitted here, so the calls must
// resolve to labels emitted elsewhere (SSE2_Multiply32 expands Mul_Acc(16..2)
// unconditionally) -- confirm.
#ifndef __GNUC__
ASJ( jmp, 0, f)
Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
AS1( ret) ASL(0)
#endif
Mul_Column0(0, 2)
Mul_Column1(1, 3)
Mul_Column0(2, 4)
Mul_Column1(3, 5)
Mul_Column0(4, 6)
Mul_Column1(5, 7)
Mul_Column0(6, 8)
Mul_Column1(7, 7)
Mul_Column0(8, 6)
Mul_Column1(9, 5)
Mul_Column0(10, 4)
Mul_Column1(11, 3)
Mul_Column0(12, 2)
Mul_End(8)
}
// SSE2_Multiply32: largest SSE2 multiply kernel, size parameter n = 16 (going
// by the name, 32-word operands). C, A and B are not named in the body;
// MulPrologue inside Mul_Begin (defined elsewhere) presumably loads them --
// confirm.
// Columns alternate Mul_Column0 / Mul_Column1 with k = 0..28; the helper index
// i climbs from 2 to 16 at k == 14 and then falls back to 2.
void SSE2_Multiply32(word *C, const word *A, const word *B)
{
Mul_Begin(16)
// Helper bodies LMul16..LMul2 are emitted here on every compiler (there is no
// __GNUC__ guard, unlike the smaller kernels), back to back with a single ret,
// and skipped on entry via the jmp to label 0.
ASJ( jmp, 0, f)
Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2)
AS1( ret) ASL(0)
Mul_Column0(0, 2)
Mul_Column1(1, 3)
Mul_Column0(2, 4)
Mul_Column1(3, 5)
Mul_Column0(4, 6)
Mul_Column1(5, 7)
Mul_Column0(6, 8)
Mul_Column1(7, 9)
Mul_Column0(8, 10)
Mul_Column1(9, 11)
Mul_Column0(10, 12)
Mul_Column1(11, 13)
Mul_Column0(12, 14)
Mul_Column1(13, 15)
Mul_Column0(14, 16)
Mul_Column1(15, 15)
Mul_Column0(16, 14)
Mul_Column1(17, 13)
Mul_Column0(18, 12)
Mul_Column1(19, 11)
Mul_Column0(20, 10)
Mul_Column1(21, 9)
Mul_Column0(22, 8)
Mul_Column1(23, 7)
Mul_Column0(24, 6)
Mul_Column1(25, 5)
Mul_Column0(26, 4)
Mul_Column1(27, 3)
Mul_Column0(28, 2)
Mul_End(16)
}
// SSE2_MultiplyBottom4: smallest "bottom" multiply kernel, size parameter
// n = 2 (going by the name, 4-word operands with only the low part of the
// product kept -- confirm against the caller's contract).
// C, A and B are not named in the body; MulPrologue inside Mul_Begin (defined
// elsewhere) presumably loads them into registers -- confirm.
// Sequence: Mul_Begin sets up the scratch copies and the first multiply,
// Bot_SaveAcc(0) opens the final column, Bot_Acc(2) adds one more product
// inline (no call/label is involved), and Bot_End(2) stores the last qword to
// [ecx+8] and restores the stack.
void SSE2_MultiplyBottom4(word *C, const word *A, const word *B)
{
Mul_Begin(2)
Bot_SaveAcc(0) Bot_Acc(2)
Bot_End(2)
}
void SSE2_MultiplyBottom8(word *C, const word *A, const word *B)
{
Mul_Begin(4)
#ifndef __GNUC__
ASJ( jmp, 0, f)
Mul_Acc(3) Mul_Acc(2)
AS1( ret) ASL(0)
#endif
Mul_Column0(0, 2)
Mul_Column1(1, 3)
Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -