📄 integer.cpp
字号:
AS2( add esi, 16) \ SSE2_FirstMultiply(1)\ Squ_Acc##i(i) \ AS2( paddd xmm4, xmm4) \ AS2( paddd xmm5, xmm5) \ AS2( movdqa xmm3, [esi]) \ AS2( movq xmm1, QWORD PTR [esi+8]) \ AS2( pmuludq xmm1, xmm3) \ AS2( pmuludq xmm3, xmm3) \ AS2( movdqa xmm0, [ebx])\ AS2( movdqa xmm2, xmm0) \ AS2( pand xmm0, xmm1) \ AS2( psrld xmm1, 16) \ AS2( paddd xmm6, xmm0) \ AS2( paddd xmm7, xmm1) \ AS2( pand xmm2, xmm3) \ AS2( psrld xmm3, 16) \ AS2( paddd xmm6, xmm6) \ AS2( paddd xmm7, xmm7) \ AS2( paddd xmm4, xmm2) \ AS2( paddd xmm5, xmm3) \ AS2( movq xmm0, QWORD PTR [esp+4])\ AS2( movq xmm1, QWORD PTR [esp+12])\ AS2( paddd xmm4, xmm0)\ AS2( paddd xmm5, xmm1)\/* Squ_Column0(k, i): one squaring column step. Runs Squ_SSE2_SaveShift(k), advances edi and edx by 16 bytes, runs SSE2_FirstMultiply(1) and Squ_Acc##i(i), then doubles all four accumulators (xmm6, xmm7, xmm4, xmm5 are each added to themselves) and finally adds the two qwords at [esp+4] and [esp+12] into xmm4 and xmm5. The helper macros are defined outside this excerpt; presumably the doubling accounts for the symmetric cross products of a square - TODO confirm. */#define Squ_Column0(k, i) \ Squ_SSE2_SaveShift(k) \ AS2( add edi, 16) \ AS2( add edx, 16) \ SSE2_FirstMultiply(1)\ Squ_Acc##i(i) \ AS2( paddd xmm6, xmm6) \ AS2( paddd xmm7, xmm7) \ AS2( paddd xmm4, xmm4) \ AS2( paddd xmm5, xmm5) \ AS2( movq xmm0, QWORD PTR [esp+4])\ AS2( movq xmm1, QWORD PTR [esp+12])\ AS2( paddd xmm4, xmm0)\ AS2( paddd xmm5, xmm1)\/* SSE2_MulAdd45: first product pair of a column. Computes xmm0 = [edi] * [esi] and xmm7 = [esi] * [edx] with pmuludq, then splits each product using the mask loaded from [ebx] and a 16-bit right shift of every dword. The first product is ADDED into xmm4 (masked part) and xmm5 (shifted part); the second product OVERWRITES xmm6 (masked part) and xmm7 (shifted part). The mask value itself is not visible here - presumably it keeps the low 16 bits of each dword; verify. */#define SSE2_MulAdd45 \ AS2( movdqa xmm7, [esi]) \ AS2( movdqa xmm0, [edi]) \ AS2( pmuludq xmm0, xmm7) \ AS2( movdqa xmm2, [ebx]) \ AS2( pmuludq xmm7, [edx]) \ AS2( movdqa xmm6, xmm2) \ AS2( pand xmm2, xmm0) \ AS2( psrld xmm0, 16) \ AS2( paddd xmm4, xmm2) \ AS2( paddd xmm5, xmm0) \ AS2( pand xmm6, xmm7) \ AS2( psrld xmm7, 16)/* Mul_Begin(n): common entry sequence for the multiply routines. After MulPrologue it saves esp in esi, aligns esp down to 16 bytes, reserves 48*n+16 bytes and pushes the saved esp, so [esp] holds the old stack pointer and esp+20 is 16-byte aligned (as the movdqa stores below require). The loop at label 1 walks edx from 0 to 8*(n) in steps of 16: it shuffles 16 bytes at [eax+edx] two ways (3,1,2,0 and 2,0,3,1) and 16 bytes at [edi+edx] one way (3,1,2,0), and stores each shuffled vector, plus a copy shifted right by 32 bits per qword, into three stack tables that start at esp+20, esp+20+16*n and esp+20+32*n. Afterwards edi, edx and esi point at the start of those three tables and SSE2_FirstMultiply(0) runs. Which of eax/edi holds A and which holds B is set by MulPrologue, outside this excerpt. */#define Mul_Begin(n) \ MulPrologue \ AS2( mov esi, esp)\ AS2( and esp, 0xfffffff0)\ AS2( sub esp, 48*n+16)\ AS1( push esi)\ AS2( xor edx, edx) \ ASL(1) \ ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ AS2( movdqa [esp+20+2*edx], xmm0) \ AS2( psrlq xmm0, 32) \ AS2( movdqa [esp+20+2*edx+16], xmm0) \ AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ AS2( psrlq xmm1, 32) \ AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ AS2( psrlq xmm2, 32) \ AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ AS2( add edx, 16) \ AS2( cmp edx, 8*(n)) \ ASJ( jne, 1, b) \ AS2( lea edi, [esp+20])\ AS2( 
lea edx, [esp+20+16*n])\ AS2( lea esi, [esp+20+32*n])\ SSE2_FirstMultiply(0) \/* Mul_Acc(i): accumulation body introduced by label LMul##i. The displacement i/2*(1-(i-2*(i/2))*2)*16 evaluates (with integer division) to +16*(i/2) for even i and -16*(i/2) for odd i; it is added to esi and subtracted from edi and edx. The body multiplies [edi-disp] by [esi+disp] and [esi+disp] by [edx-disp], splits both products with the [ebx] mask and a 16-bit per-dword right shift, and adds the pieces into xmm4/xmm5 (first product) and xmm6/xmm7 (second product). There is no jump or ret inside the macro, so consecutive Mul_Acc bodies fall through into one another. */#define Mul_Acc(i) \ ASL(LMul##i) \ AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \ AS2( movdqa xmm2, [ebx]) \ AS2( pmuludq xmm0, xmm1) \ AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ AS2( movdqa xmm3, xmm2) \ AS2( pand xmm2, xmm0) \ AS2( psrld xmm0, 16) \ AS2( paddd xmm4, xmm2) \ AS2( paddd xmm5, xmm0) \ AS2( pand xmm3, xmm1) \ AS2( psrld xmm1, 16) \ AS2( paddd xmm6, xmm3) \ AS2( paddd xmm7, xmm1) \/* Mul_Acc1 .. Mul_Acc16: per-column dispatchers selected by token pasting (Mul_Acc##i) in the column macros. Mul_Acc1 expands to nothing; Mul_Acc2 expands to ASC(call, LMul##i), i.e. a call to the label that Mul_Acc(i) defines; Mul_Acc3 .. Mul_Acc16 simply forward to Mul_Acc2. ASC is defined outside this excerpt. */#define Mul_Acc1(i) #define Mul_Acc2(i) ASC(call, LMul##i)#define Mul_Acc3(i) Mul_Acc2(i)#define Mul_Acc4(i) Mul_Acc2(i)#define Mul_Acc5(i) Mul_Acc2(i)#define Mul_Acc6(i) Mul_Acc2(i)#define Mul_Acc7(i) Mul_Acc2(i)#define Mul_Acc8(i) Mul_Acc2(i)#define Mul_Acc9(i) Mul_Acc2(i)#define Mul_Acc10(i) Mul_Acc2(i)#define Mul_Acc11(i) Mul_Acc2(i)#define Mul_Acc12(i) Mul_Acc2(i)#define Mul_Acc13(i) Mul_Acc2(i)#define Mul_Acc14(i) Mul_Acc2(i)#define Mul_Acc15(i) Mul_Acc2(i)#define Mul_Acc16(i) Mul_Acc2(i)/* Mul_Column1(k, i): column step that runs SSE2_SaveShift(k), advances esi by 16 bytes, then runs SSE2_MulAdd45 followed by Mul_Acc##i(i). SSE2_SaveShift is defined outside this excerpt. */#define Mul_Column1(k, i) \ SSE2_SaveShift(k) \ AS2( add esi, 16) \ SSE2_MulAdd45\ Mul_Acc##i(i) \/* Mul_Column0(k, i): counterpart of Mul_Column1 that advances edi and edx by 16 bytes instead of esi before running SSE2_MulAdd45 and Mul_Acc##i(i). */#define Mul_Column0(k, i) \ SSE2_SaveShift(k) \ AS2( add edi, 16) \ AS2( add edx, 16) \ SSE2_MulAdd45\ Mul_Acc##i(i) \/* Bot_Acc(i): accumulation step used by the MultiplyBottom routines. Uses the same displacement expression as Mul_Acc(i) but defines no label and does no mask/shift split: the [edi-disp] * [esi+disp] product is added to xmm4 with paddq and the [esi+disp] * [edx-disp] product is added to xmm6 with paddd. */#define Bot_Acc(i) \ AS2( movdqa xmm1, [esi+i/2*(1-(i-2*(i/2))*2)*16]) \ AS2( movdqa xmm0, [edi-i/2*(1-(i-2*(i/2))*2)*16]) \ AS2( pmuludq xmm0, xmm1) \ AS2( pmuludq xmm1, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ AS2( paddq xmm4, xmm0) \ AS2( paddd xmm6, xmm1)/* Bot_SaveAcc(k): opens the final column of a MultiplyBottom routine. Runs SSE2_SaveShift(k), advances edi and edx by 16 bytes, adds [edi] * [esi] into xmm4 (paddq), folds xmm5 into xmm4 after shifting each qword of xmm5 left by 16 bits, and leaves [esi] * [edx] in xmm6. */#define Bot_SaveAcc(k) \ SSE2_SaveShift(k) \ AS2( add edi, 16) \ AS2( add edx, 16) \ AS2( movdqa xmm6, [esi]) \ AS2( movdqa xmm0, [edi]) \ AS2( pmuludq xmm0, xmm6) \ AS2( paddq xmm4, xmm0) \ AS2( psllq xmm5, 16) \ AS2( paddq xmm4, xmm5) \ AS2( pmuludq xmm6, [edx])#define Bot_End(n) \ AS2( movhlps xmm7, xmm6) \ AS2( paddd xmm6, xmm7) \ AS2( psllq xmm6, 32) \ AS2( paddd xmm4, xmm6) \ AS2( movq QWORD PTR [ecx+8*((n)-1)], xmm4) \ AS1( pop esp)\ MulEpilogue/* Top_Begin(n): entry sequence for the MultiplyTop routines. Same stack setup and table-building loop as Mul_Begin, except that it starts with TopPrologue, saves the old esp through edx instead of esi, copies esi into eax after the loop, points edi, edx and esi 16*(n/2-1) bytes into their respective tables, and clears xmm4 and xmm5. The value esi holds on entry is set by TopPrologue, outside this excerpt - presumably the L argument; verify. */#define Top_Begin(n) \ 
TopPrologue \ AS2( mov edx, esp)\ AS2( and esp, 0xfffffff0)\ AS2( sub esp, 48*n+16)\ AS1( push edx)\ AS2( xor edx, edx) \ ASL(1) \ ASS( pshufd xmm0, [eax+edx], 3,1,2,0) \ ASS( pshufd xmm1, [eax+edx], 2,0,3,1) \ ASS( pshufd xmm2, [edi+edx], 3,1,2,0) \ AS2( movdqa [esp+20+2*edx], xmm0) \ AS2( psrlq xmm0, 32) \ AS2( movdqa [esp+20+2*edx+16], xmm0) \ AS2( movdqa [esp+20+16*n+2*edx], xmm1) \ AS2( psrlq xmm1, 32) \ AS2( movdqa [esp+20+16*n+2*edx+16], xmm1) \ AS2( movdqa [esp+20+32*n+2*edx], xmm2) \ AS2( psrlq xmm2, 32) \ AS2( movdqa [esp+20+32*n+2*edx+16], xmm2) \ AS2( add edx, 16) \ AS2( cmp edx, 8*(n)) \ ASJ( jne, 1, b) \ AS2( mov eax, esi) \ AS2( lea edi, [esp+20+00*n+16*(n/2-1)])\ AS2( lea edx, [esp+20+16*n+16*(n/2-1)])\ AS2( lea esi, [esp+20+32*n+16*(n/2-1)])\ AS2( pxor xmm4, xmm4)\ AS2( pxor xmm5, xmm5)/* Top_Acc(i): multiplies the qword at [esi+disp+8] by [edx-disp] (disp is the same even/odd displacement expression used by Mul_Acc), shifts each 64-bit product right by 48 bits and adds the result into xmm5, so only the top 16 bits of each product contribute. */#define Top_Acc(i) \ AS2( movq xmm0, QWORD PTR [esi+i/2*(1-(i-2*(i/2))*2)*16+8]) \ AS2( pmuludq xmm0, [edx-i/2*(1-(i-2*(i/2))*2)*16]) \ AS2( psrlq xmm0, 48) \ AS2( paddd xmm5, xmm0)\/* Top_Column0(i): shifts each qword of xmm5 left by 32 bits, advances edi and edx by 16 bytes, then runs SSE2_MulAdd45 and Mul_Acc##i(i). */#define Top_Column0(i) \ AS2( psllq xmm5, 32) \ AS2( add edi, 16) \ AS2( add edx, 16) \ SSE2_MulAdd45\ Mul_Acc##i(i) \/* Top_Column1(i): runs SSE2_SaveShift(0), advances esi by 16 bytes, then SSE2_MulAdd45 and Mul_Acc##i(i). The tail compares the upper 16 bits of the dword at [ecx+4] with the upper 16 bits of eax and adds 1 to the low dword of xmm4 when the former is greater (pcmpgtd yields an all-ones dword, psrld by 31 reduces it to 1). Presumably a carry/rounding correction derived from L - TODO confirm. */#define Top_Column1(i) \ SSE2_SaveShift(0) \ AS2( add esi, 16) \ SSE2_MulAdd45\ Mul_Acc##i(i) \ AS2( shr eax, 16) \ AS2( movd xmm0, eax)\ AS2( movd xmm1, [ecx+4])\ AS2( psrld xmm1, 16)\ AS2( pcmpgtd xmm1, xmm0)\ AS2( psrld xmm1, 31)\ AS2( paddd xmm4, xmm1)\/* SSE2_Square4: expands Squ_Begin(2), a single Squ_Column0(0, 1) step and Squ_End(2). Squ_Begin and Squ_End are defined outside this excerpt; presumably writes the square of the 4-word operand A to C - verify against callers. */void SSE2_Square4(word *C, const word *A){ Squ_Begin(2) Squ_Column0(0, 1) Squ_End(2)}/* SSE2_Square8: Squ_Begin(4), five alternating Squ_Column0/Squ_Column1 steps, Squ_End(4). On non-GCC builds the Squ_Acc(2) body is emitted once, skipped by a jump to local label 0 and terminated by ret, presumably so the Squ_Acc##i dispatchers can call it as a subroutine (Squ_Acc is defined outside this excerpt). */void SSE2_Square8(word *C, const word *A){ Squ_Begin(4)#ifndef __GNUC__ ASJ( jmp, 0, f) Squ_Acc(2) AS1( ret) ASL(0)#endif Squ_Column0(0, 1) Squ_Column1(1, 1) Squ_Column0(2, 2) Squ_Column1(3, 1) Squ_Column0(4, 1) Squ_End(4)}/* SSE2_Square16: Squ_Begin(8), thirteen column steps (k = 0..12) whose accumulation count rises to 4 and falls back to 1, then Squ_End(8). Non-GCC builds emit the Squ_Acc(4), Squ_Acc(3), Squ_Acc(2) bodies behind a jump, followed by ret. */void SSE2_Square16(word *C, const word *A){ Squ_Begin(8)#ifndef __GNUC__ ASJ( jmp, 0, f) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) AS1( ret) ASL(0)#endif Squ_Column0(0, 1) Squ_Column1(1, 1) Squ_Column0(2, 2) Squ_Column1(3, 2) Squ_Column0(4, 3) Squ_Column1(5, 3) Squ_Column0(6, 4) Squ_Column1(7, 3) Squ_Column0(8, 3) 
Squ_Column1(9, 2) Squ_Column0(10, 2) Squ_Column1(11, 1) Squ_Column0(12, 1) Squ_End(8)}/* SSE2_Square32: Squ_Begin(16), twenty-nine column steps (k = 0..28) whose accumulation count rises to 8 and falls back to 1, then Squ_End(16). NOTE(review): unlike SSE2_Square8 and SSE2_Square16, the jump / Squ_Acc(8..2) / ret subroutine block here is not wrapped in #ifndef __GNUC__ - confirm this is intentional. */void SSE2_Square32(word *C, const word *A){ Squ_Begin(16) ASJ( jmp, 0, f) Squ_Acc(8) Squ_Acc(7) Squ_Acc(6) Squ_Acc(5) Squ_Acc(4) Squ_Acc(3) Squ_Acc(2) AS1( ret) ASL(0) Squ_Column0(0, 1) Squ_Column1(1, 1) Squ_Column0(2, 2) Squ_Column1(3, 2) Squ_Column0(4, 3) Squ_Column1(5, 3) Squ_Column0(6, 4) Squ_Column1(7, 4) Squ_Column0(8, 5) Squ_Column1(9, 5) Squ_Column0(10, 6) Squ_Column1(11, 6) Squ_Column0(12, 7) Squ_Column1(13, 7) Squ_Column0(14, 8) Squ_Column1(15, 7) Squ_Column0(16, 7) Squ_Column1(17, 6) Squ_Column0(18, 6) Squ_Column1(19, 5) Squ_Column0(20, 5) Squ_Column1(21, 4) Squ_Column0(22, 4) Squ_Column1(23, 3) Squ_Column0(24, 3) Squ_Column1(25, 2) Squ_Column0(26, 2) Squ_Column1(27, 1) Squ_Column0(28, 1) Squ_End(16)}/* SSE2_Multiply4: Mul_Begin(2), one Mul_Column0(0, 2) step, Mul_End(2) (Mul_End is defined outside this excerpt). On non-GCC builds the Mul_Acc(2) body is emitted once behind a jump to local label 0 and followed by ret, giving the call issued by Mul_Acc2 its LMul2 target. Presumably computes C = A * B for 4-word operands - verify against callers. */void SSE2_Multiply4(word *C, const word *A, const word *B){ Mul_Begin(2)#ifndef __GNUC__ ASJ( jmp, 0, f) Mul_Acc(2) AS1( ret) ASL(0)#endif Mul_Column0(0, 2) Mul_End(2)}/* SSE2_Multiply8: Mul_Begin(4), five column steps with accumulation index 2,3,4,3,2, then Mul_End(4). Non-GCC builds emit Mul_Acc(4), Mul_Acc(3), Mul_Acc(2) as a fall-through chain ending in ret, so calling LMul<i> runs bodies i down to 2. */void SSE2_Multiply8(word *C, const word *A, const word *B){ Mul_Begin(4)#ifndef __GNUC__ ASJ( jmp, 0, f) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) AS1( ret) ASL(0)#endif Mul_Column0(0, 2) Mul_Column1(1, 3) Mul_Column0(2, 4) Mul_Column1(3, 3) Mul_Column0(4, 2) Mul_End(4)}/* SSE2_Multiply16: Mul_Begin(8), thirteen column steps whose accumulation index rises from 2 to 8 and falls back to 2, then Mul_End(8). Non-GCC builds emit the Mul_Acc(8..2) chain behind a jump, followed by ret. */void SSE2_Multiply16(word *C, const word *A, const word *B){ Mul_Begin(8)#ifndef __GNUC__ ASJ( jmp, 0, f) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) AS1( ret) ASL(0)#endif Mul_Column0(0, 2) Mul_Column1(1, 3) Mul_Column0(2, 4) Mul_Column1(3, 5) Mul_Column0(4, 6) Mul_Column1(5, 7) Mul_Column0(6, 8) Mul_Column1(7, 7) Mul_Column0(8, 6) Mul_Column1(9, 5) Mul_Column0(10, 4) Mul_Column1(11, 3) Mul_Column0(12, 2) Mul_End(8)}/* SSE2_Multiply32: Mul_Begin(16), twenty-nine column steps whose accumulation index rises from 2 to 16 and falls back to 2, then Mul_End(16). NOTE(review): the jump / Mul_Acc(16..2) / ret subroutine block is emitted unconditionally here, whereas the smaller Multiply variants guard it with #ifndef __GNUC__ - confirm this is intentional. */void SSE2_Multiply32(word *C, const word *A, const word *B){ Mul_Begin(16) ASJ( jmp, 0, f) Mul_Acc(16) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) AS1( ret) ASL(0) Mul_Column0(0, 2) 
Mul_Column1(1, 3) Mul_Column0(2, 4) Mul_Column1(3, 5) Mul_Column0(4, 6) Mul_Column1(5, 7) Mul_Column0(6, 8) Mul_Column1(7, 9) Mul_Column0(8, 10) Mul_Column1(9, 11) Mul_Column0(10, 12) Mul_Column1(11, 13) Mul_Column0(12, 14) Mul_Column1(13, 15) Mul_Column0(14, 16) Mul_Column1(15, 15) Mul_Column0(16, 14) Mul_Column1(17, 13) Mul_Column0(18, 12) Mul_Column1(19, 11) Mul_Column0(20, 10) Mul_Column1(21, 9) Mul_Column0(22, 8) Mul_Column1(23, 7) Mul_Column0(24, 6) Mul_Column1(25, 5) Mul_Column0(26, 4) Mul_Column1(27, 3) Mul_Column0(28, 2) Mul_End(16)}/* SSE2_MultiplyBottom4: Mul_Begin(2), then Bot_SaveAcc(0), Bot_Acc(2) and Bot_End(2); no subroutine block is needed because no Mul_Column step is used. Presumably produces only the low half of A * B - verify against callers. */void SSE2_MultiplyBottom4(word *C, const word *A, const word *B){ Mul_Begin(2) Bot_SaveAcc(0) Bot_Acc(2) Bot_End(2)}/* SSE2_MultiplyBottom8: Mul_Begin(4), two Mul_Column steps (index 2, 3), then the final column via Bot_SaveAcc(2), Bot_Acc(4..2) and Bot_End(4). Non-GCC builds emit the Mul_Acc(3), Mul_Acc(2) chain behind a jump, followed by ret. */void SSE2_MultiplyBottom8(word *C, const word *A, const word *B){ Mul_Begin(4)#ifndef __GNUC__ ASJ( jmp, 0, f) Mul_Acc(3) Mul_Acc(2) AS1( ret) ASL(0)#endif Mul_Column0(0, 2) Mul_Column1(1, 3) Bot_SaveAcc(2) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) Bot_End(4)}/* SSE2_MultiplyBottom16: Mul_Begin(8), six Mul_Column steps (index 2..7), then the final column via Bot_SaveAcc(6), Bot_Acc(8..2) and Bot_End(8). Non-GCC builds emit the Mul_Acc(7..2) chain behind a jump, followed by ret. */void SSE2_MultiplyBottom16(word *C, const word *A, const word *B){ Mul_Begin(8)#ifndef __GNUC__ ASJ( jmp, 0, f) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) AS1( ret) ASL(0)#endif Mul_Column0(0, 2) Mul_Column1(1, 3) Mul_Column0(2, 4) Mul_Column1(3, 5) Mul_Column0(4, 6) Mul_Column1(5, 7) Bot_SaveAcc(6) Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) Bot_End(8)}/* SSE2_MultiplyBottom32: Mul_Begin(16), fourteen Mul_Column steps (index 2..15), then the final column via Bot_SaveAcc(14), Bot_Acc(16..2) and Bot_End(16). Non-GCC builds emit the Mul_Acc(15..2) chain behind a jump, followed by ret. */void SSE2_MultiplyBottom32(word *C, const word *A, const word *B){ Mul_Begin(16)#ifndef __GNUC__ ASJ( jmp, 0, f) Mul_Acc(15) Mul_Acc(14) Mul_Acc(13) Mul_Acc(12) Mul_Acc(11) Mul_Acc(10) Mul_Acc(9) Mul_Acc(8) Mul_Acc(7) Mul_Acc(6) Mul_Acc(5) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) AS1( ret) ASL(0)#endif Mul_Column0(0, 2) Mul_Column1(1, 3) Mul_Column0(2, 4) Mul_Column1(3, 5) Mul_Column0(4, 6) Mul_Column1(5, 7) Mul_Column0(6, 8) Mul_Column1(7, 9) Mul_Column0(8, 10) Mul_Column1(9, 11) Mul_Column0(10, 12) Mul_Column1(11, 13) Mul_Column0(12, 14) Mul_Column1(13, 15) Bot_SaveAcc(14) Bot_Acc(16) Bot_Acc(15) Bot_Acc(14) Bot_Acc(13) Bot_Acc(12) Bot_Acc(11) Bot_Acc(10) Bot_Acc(9) 
Bot_Acc(8) Bot_Acc(7) Bot_Acc(6) Bot_Acc(5) Bot_Acc(4) Bot_Acc(3) Bot_Acc(2) Bot_End(16)}/* SSE2_MultiplyTop8: Top_Begin(4), three Top_Acc steps (3, 2, 1) that gather only the top 16 bits of their products, then Top_Column0(4), Top_Column1(3), Mul_Column0(0, 2) and Top_End(2) (Top_End is defined outside this excerpt). Non-GCC builds emit the Mul_Acc(4..2) chain behind a jump, followed by ret. Presumably produces the high half of A * B using L for the carry correction - verify against callers. */void SSE2_MultiplyTop8(word *C, const word *A, const word *B, word L){ Top_Begin(4) Top_Acc(3) Top_Acc(2) Top_Acc(1)#ifndef __GNUC__ ASJ( jmp, 0, f) Mul_Acc(4) Mul_Acc(3) Mul_Acc(2) AS1( ret) ASL(0)#endif Top_Column0(4) Top_Column1(3) Mul_Column0(0, 2) Top_End(2)}void SSE2_MultiplyTop16(word *C, const word *A, const word *B, word L){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -