// sse.cpp
#include "SIMD.h"
// Multiply-accumulates cnt elements of A against B with PMADDWD and returns
// the two 32-bit running sums packed into a single 64-bit value.
//
// As the loads below show, every element of A is 4 bytes (two signed 16-bit
// words a0,a1) and every element of B is 8 bytes (four signed 16-bit words
// b0..b3).  Each A element is duplicated to {a0,a1,a0,a1} and PMADDWD'ed with
// its B element, which yields two 32-bit lanes per element:
//     low  lane += a0*b0 + a1*b1
//     high lane += a0*b2 + a1*b3
// (The original comments call A the "IF data" and B the "sine data".)
//
// A   - pointer to cnt 4-byte elements
// B   - pointer to cnt 8-byte elements
// cnt - number of elements; values <= 0 return 0
// Returns the low-lane sum in bits 0..31 and the high-lane sum in bits 32..63.
// Both sums are accumulated with PADDD, so they wrap modulo 2^32.
//
// Written for a 32-bit MSVC-style inline assembler; uses MMX and SSE2.
__int64 sse_cacc(void *A, void *B, int cnt)
{
    int cnt1;           // iterations of the unrolled SSE2 loop
    int cnt2;           // leftover elements, handled one at a time with MMX
    __int64 result;

    // Nothing to accumulate.  A negative count must not reach the asm below:
    // it would end up in ECX and make a LOOP instruction run ~2^32 times.
    if (cnt <= 0)
        return 0;

    if (((int)A % 16) || ((int)B % 16))
    {
        // At least one pointer is not 16-byte aligned: load B with unaligned
        // moves, 6 elements (24 bytes of A, 48 bytes of B) per iteration.
        cnt1 = cnt / 6;
        cnt2 = (cnt - (6 * cnt1));
        __asm
        {
            mov edi, A;             // edi -> A
            mov esi, B;             // esi -> B
            mov ecx, cnt1;          // ecx = unrolled iterations
            pxor xmm0, xmm0;        // SSE2 accumulator (4 x 32-bit lanes)
            pxor mm0, mm0;          // MMX accumulator (2 x 32-bit lanes)
            jecxz ZERO1;
        L1:
            movlpd xmm1, [edi];     // two A elements into the low 64 bits
            movlpd xmm2, [edi+8];
            movlpd xmm3, [edi+16];
            movupd xmm4, [esi];     // two B elements (unaligned load)
            movupd xmm5, [esi+16];
            movupd xmm6, [esi+32];
            punpckldq xmm1, xmm1;   // {e0,e1,-,-} -> {e0,e0,e1,e1}
            punpckldq xmm2, xmm2;
            punpckldq xmm3, xmm3;
            pmaddwd xmm1, xmm4;     // 16x16 multiplies, pairwise added to 32 bits
            pmaddwd xmm2, xmm5;
            pmaddwd xmm3, xmm6;
            paddd xmm0, xmm3;       // fold the three partial results into xmm0
            paddd xmm1, xmm2;
            paddd xmm0, xmm1;
            add edi, 24;            // A advances 6 elements * 4 bytes
            add esi, 48;            // B advances 6 elements * 8 bytes
            loop L1;
        ZERO1:
            mov ecx, cnt2;
            jecxz ZERO2;
        L2:
            // Load exactly one 4-byte A element.  A 64-bit MOVQ here would read
            // 4 bytes beyond the end of A on the final element; the upper half
            // is overwritten by PUNPCKLDQ anyway.
            movd mm1, dword ptr [edi];
            punpckldq mm1, mm1;     // duplicate the element into the high 32 bits
            pmaddwd mm1, [esi];     // multiply-add against one 8-byte B element
            paddd mm0, mm1;
            add edi, 4;             // next A element
            add esi, 8;             // next B element
            loop L2;
        ZERO2:
            movdq2q mm1, xmm0;      // low 64 bits of the SSE2 accumulator
            punpckhqdq xmm0, xmm0;  // bring bits 64..127 down to 0..63
            movdq2q mm2, xmm0;      // former high 64 bits
            paddd mm0, mm1;         // combine SSE2 halves with the MMX tail sum
            paddd mm0, mm2;
            movq result, mm0;
            EMMS;                   // leave MMX state
        }//end __asm
    }
    else
    {
        // Both pointers are 16-byte aligned: PMADDWD reads B straight from
        // memory, 12 elements (48 bytes of A, 96 bytes of B) per iteration.
        cnt1 = cnt / 12;
        cnt2 = (cnt - (12 * cnt1));
        __asm
        {
            mov edi, A;             // edi -> A
            mov esi, B;             // esi -> B
            mov ecx, cnt1;          // ecx = unrolled iterations
            pxor xmm0, xmm0;        // SSE2 accumulator (4 x 32-bit lanes)
            pxor mm0, mm0;          // MMX accumulator (2 x 32-bit lanes)
            jecxz AZERO1;
        AL1:
            movlpd xmm1, [edi];     // two A elements into each low 64 bits
            movlpd xmm2, [edi+8];
            movlpd xmm3, [edi+16];
            movlpd xmm4, [edi+24];
            movlpd xmm5, [edi+32];
            movlpd xmm6, [edi+40];
            punpckldq xmm1, xmm1;   // {e0,e1,-,-} -> {e0,e0,e1,e1}
            punpckldq xmm2, xmm2;
            punpckldq xmm3, xmm3;
            punpckldq xmm4, xmm4;
            punpckldq xmm5, xmm5;
            punpckldq xmm6, xmm6;
            pmaddwd xmm1, [esi];    // aligned memory operand: two B elements
            pmaddwd xmm2, [esi+16];
            pmaddwd xmm3, [esi+32];
            pmaddwd xmm4, [esi+48];
            pmaddwd xmm5, [esi+64];
            pmaddwd xmm6, [esi+80];
            paddd xmm1, xmm2;       // tree-add the six partial results
            paddd xmm3, xmm4;
            paddd xmm5, xmm6;
            paddd xmm1, xmm3;
            paddd xmm0, xmm5;
            paddd xmm0, xmm1;
            add edi, 48;            // A advances 12 elements * 4 bytes
            add esi, 96;            // B advances 12 elements * 8 bytes
            loop AL1;
        AZERO1:
            mov ecx, cnt2;
            jecxz AZERO2;
        AL2:
            // 4-byte load only; see the note in the unaligned tail loop.
            movd mm1, dword ptr [edi];
            punpckldq mm1, mm1;     // duplicate the element into the high 32 bits
            pmaddwd mm1, [esi];     // multiply-add against one 8-byte B element
            paddd mm0, mm1;
            add edi, 4;             // next A element
            add esi, 8;             // next B element
            loop AL2;
        AZERO2:
            movdq2q mm1, xmm0;      // low 64 bits of the SSE2 accumulator
            punpckhqdq xmm0, xmm0;  // bring bits 64..127 down to 0..63
            movdq2q mm2, xmm0;      // former high 64 bits
            paddd mm0, mm1;         // combine SSE2 halves with the MMX tail sum
            paddd mm0, mm2;
            movq result, mm0;
            EMMS;                   // leave MMX state
        }//end __asm
    }//end if
    return(result);
}
// Dot product of two arrays of signed 16-bit integers.
//
// A, B - pointers to cnt 16-bit elements each
// cnt  - number of elements
// Returns sum(A[i] * B[i]) as a 32-bit int; the partial sums are combined
// with PADDD / 32-bit ADD, so the total wraps modulo 2^32.
//
// The work is split into three stages: an unrolled SSE2 loop, a one-vector
// (8 element) SSE2 loop, and a scalar loop for the final < 8 elements.  When
// both pointers are 16-byte aligned the unrolled loop covers 56 elements per
// pass with aligned loads; otherwise it covers 24 with unaligned loads.
//
// Written for a 32-bit MSVC-style inline assembler; uses SSE2.
int sse_dot(void *A, void *B, int cnt)
{
    int vecSum;             // horizontal sum of the four SIMD accumulator lanes
    int tailSum = 0;        // products of the trailing elements (scalar stage)
    int wideIters;          // passes of the unrolled SSE2 loop
    int narrowIters;        // passes of the single-vector SSE2 loop
    int scalarIters;        // passes of the scalar loop

    const bool bothAligned = !(((int)A % 16) || ((int)B % 16));

    if (bothAligned)
    {
        const int rest = cnt % 56;
        wideIters   = cnt / 56;
        narrowIters = rest / 8;
        scalarIters = rest % 8;
        __asm
        {
            mov     edi, A                      // edi -> A
            mov     esi, B                      // esi -> B
            mov     ecx, wideIters
            pxor    xmm0, xmm0                  // xmm0 = 4 x 32-bit running sums
            jecxz   a_wide_end
        a_wide:
            movapd  xmm1, [esi]                 // 7 vectors (56 words) of B
            movapd  xmm2, [esi+16]
            movapd  xmm3, [esi+32]
            movapd  xmm4, [esi+48]
            movapd  xmm5, [esi+64]
            movapd  xmm6, [esi+80]
            movapd  xmm7, [esi+96]
            pmaddwd xmm1, [edi]                 // multiply by A, add word pairs
            pmaddwd xmm2, [edi+16]
            pmaddwd xmm3, [edi+32]
            pmaddwd xmm4, [edi+48]
            pmaddwd xmm5, [edi+64]
            pmaddwd xmm6, [edi+80]
            pmaddwd xmm7, [edi+96]
            paddd   xmm0, xmm7                  // tree-add the partial results
            paddd   xmm1, xmm2
            paddd   xmm3, xmm4
            paddd   xmm5, xmm6
            paddd   xmm1, xmm3
            paddd   xmm0, xmm5
            paddd   xmm0, xmm1
            add     esi, 112
            add     edi, 112
            dec     ecx
            jnz     a_wide
        a_wide_end:
            mov     ecx, narrowIters
            jecxz   a_narrow_end
        a_narrow:
            movapd  xmm1, [esi]                 // one vector (8 words) at a time
            pmaddwd xmm1, [edi]
            paddd   xmm0, xmm1
            add     esi, 16
            add     edi, 16
            dec     ecx
            jnz     a_narrow
        a_narrow_end:
            movd    eax, xmm0                   // eax = lane 0
            psrldq  xmm0, 4
            movd    ebx, xmm0
            add     eax, ebx                    //     + lane 1
            psrldq  xmm0, 4
            movd    ebx, xmm0
            add     eax, ebx                    //     + lane 2
            psrldq  xmm0, 4
            movd    ebx, xmm0
            add     eax, ebx                    //     + lane 3
            mov     vecSum, eax
            mov     ecx, scalarIters
            jecxz   a_done
        a_scalar:
            movsx   ebx, word ptr [edi]         // sign-extended A element
            movsx   eax, word ptr [esi]         // sign-extended B element
            imul    ebx, eax
            add     tailSum, ebx
            add     esi, 2
            add     edi, 2
            dec     ecx
            jnz     a_scalar
        a_done:
            emms
        }
    }
    else
    {
        const int rest = cnt % 24;
        wideIters   = cnt / 24;
        narrowIters = rest / 8;
        scalarIters = rest % 8;
        __asm
        {
            mov     edi, A                      // edi -> A
            mov     esi, B                      // esi -> B
            mov     ecx, wideIters
            pxor    xmm0, xmm0                  // xmm0 = 4 x 32-bit running sums
            jecxz   u_wide_end
        u_wide:
            movupd  xmm1, [esi]                 // 3 vectors (24 words) of B
            movupd  xmm2, [esi+16]
            movupd  xmm3, [esi+32]
            movupd  xmm4, [edi]                 // matching 3 vectors of A
            movupd  xmm5, [edi+16]
            movupd  xmm6, [edi+32]
            pmaddwd xmm1, xmm4                  // multiply, add word pairs
            pmaddwd xmm2, xmm5
            pmaddwd xmm3, xmm6
            paddd   xmm1, xmm3                  // fold into the accumulator
            paddd   xmm0, xmm2
            paddd   xmm0, xmm1
            add     esi, 48
            add     edi, 48
            dec     ecx
            jnz     u_wide
        u_wide_end:
            mov     ecx, narrowIters
            jecxz   u_narrow_end
        u_narrow:
            movupd  xmm1, [esi]                 // one vector (8 words) at a time
            movupd  xmm2, [edi]
            pmaddwd xmm1, xmm2
            paddd   xmm0, xmm1
            add     esi, 16
            add     edi, 16
            dec     ecx
            jnz     u_narrow
        u_narrow_end:
            movd    eax, xmm0                   // eax = lane 0
            psrldq  xmm0, 4
            movd    ebx, xmm0
            add     eax, ebx                    //     + lane 1
            psrldq  xmm0, 4
            movd    ebx, xmm0
            add     eax, ebx                    //     + lane 2
            psrldq  xmm0, 4
            movd    ebx, xmm0
            add     eax, ebx                    //     + lane 3
            mov     vecSum, eax
            mov     ecx, scalarIters
            jecxz   u_done
        u_scalar:
            movsx   ebx, word ptr [edi]         // sign-extended A element
            movsx   eax, word ptr [esi]         // sign-extended B element
            imul    ebx, eax
            add     tailSum, ebx
            add     esi, 2
            add     edi, 2
            dec     ecx
            jnz     u_scalar
        u_done:
            emms
        }
    }

    vecSum += tailSum;
    return vecSum;
}
// In-place element-wise addition of 16-bit integers: A[i] += B[i].
//
// A   - pointer to cnt 16-bit elements; receives the sums
// B   - pointer to cnt 16-bit elements; read only
// cnt - number of elements
// Sums are formed with PADDW / 16-bit ADD, so each element wraps modulo 2^16.
//
// The work is split into three stages: an unrolled SSE2 loop, a one-vector
// (8 element) SSE2 loop, and a scalar loop for the final < 8 elements.  When
// both pointers are 16-byte aligned the unrolled loop covers 56 elements per
// pass with aligned moves; otherwise it covers 32 with unaligned moves.
//
// Written for a 32-bit MSVC-style inline assembler; uses SSE2.
void sse_add(void *A, void *B, int cnt)
{
    int wideIters;          // passes of the unrolled SSE2 loop
    int narrowIters;        // passes of the single-vector SSE2 loop
    int scalarIters;        // passes of the scalar loop

    const bool bothAligned = !(((int)A % 16) || ((int)B % 16));

    if (bothAligned)
    {
        const int rest = cnt % 56;
        wideIters   = cnt / 56;
        narrowIters = rest / 8;
        scalarIters = rest % 8;
        _asm
        {
            mov     edi, A                      // edi -> A (source and destination)
            mov     esi, B                      // esi -> B
            mov     ecx, wideIters
            jecxz   a_wide_end
        a_wide:
            movapd  xmm0, [edi]                 // 7 vectors (56 words) of A
            movapd  xmm1, [edi+16]
            movapd  xmm2, [edi+32]
            movapd  xmm3, [edi+48]
            movapd  xmm4, [edi+64]
            movapd  xmm5, [edi+80]
            movapd  xmm6, [edi+96]
            paddw   xmm0, [esi]                 // add the matching words of B
            paddw   xmm1, [esi+16]
            paddw   xmm2, [esi+32]
            paddw   xmm3, [esi+48]
            paddw   xmm4, [esi+64]
            paddw   xmm5, [esi+80]
            paddw   xmm6, [esi+96]
            movapd  [edi], xmm0                 // write the sums back over A
            movapd  [edi+16], xmm1
            movapd  [edi+32], xmm2
            movapd  [edi+48], xmm3
            movapd  [edi+64], xmm4
            movapd  [edi+80], xmm5
            movapd  [edi+96], xmm6
            add     edi, 112
            add     esi, 112
            dec     ecx
            jnz     a_wide
        a_wide_end:
            mov     ecx, narrowIters
            jecxz   a_narrow_end
        a_narrow:
            movapd  xmm0, [edi]                 // one vector (8 words) at a time
            paddw   xmm0, [esi]
            movapd  [edi], xmm0
            add     edi, 16
            add     esi, 16
            dec     ecx
            jnz     a_narrow
        a_narrow_end:
            mov     ecx, scalarIters
            jecxz   a_done
        a_scalar:
            mov     ax, [edi]                   // one 16-bit element at a time
            add     ax, [esi]
            mov     [edi], ax
            add     esi, 2
            add     edi, 2
            dec     ecx
            jnz     a_scalar
        a_done:
            emms
        }
    }
    else
    {
        const int rest = cnt % 32;
        wideIters   = cnt / 32;
        narrowIters = rest / 8;
        scalarIters = rest % 8;
        _asm
        {
            mov     edi, A                      // edi -> A (source and destination)
            mov     esi, B                      // esi -> B
            mov     ecx, wideIters
            jecxz   u_wide_end
        u_wide:
            movupd  xmm0, [edi]                 // 4 vectors (32 words) of A
            movupd  xmm1, [edi+16]
            movupd  xmm2, [edi+32]
            movupd  xmm3, [edi+48]
            movupd  xmm4, [esi]                 // matching 4 vectors of B
            movupd  xmm5, [esi+16]
            movupd  xmm6, [esi+32]
            movupd  xmm7, [esi+48]
            paddw   xmm0, xmm4                  // word-wise A + B
            paddw   xmm1, xmm5
            paddw   xmm2, xmm6
            paddw   xmm3, xmm7
            movupd  [edi], xmm0                 // write the sums back over A
            movupd  [edi+16], xmm1
            movupd  [edi+32], xmm2
            movupd  [edi+48], xmm3
            add     edi, 64
            add     esi, 64
            dec     ecx
            jnz     u_wide
        u_wide_end:
            mov     ecx, narrowIters
            jecxz   u_narrow_end
        u_narrow:
            movupd  xmm0, [edi]                 // one vector (8 words) at a time
            movupd  xmm1, [esi]
            paddw   xmm0, xmm1
            movupd  [edi], xmm0
            add     edi, 16
            add     esi, 16
            dec     ecx
            jnz     u_narrow
        u_narrow_end:
            mov     ecx, scalarIters
            jecxz   u_done
        u_scalar:
            mov     ax, [edi]                   // one 16-bit element at a time
            add     ax, [esi]
            mov     [edi], ax
            add     esi, 2
            add     edi, 2
            dec     ecx
            jnz     u_scalar
        u_done:
            emms
        }
    }
}
void sse_add(void *A, void *B, void *C, int cnt)
{
int cnt1;
int cnt2;
int cnt3;
if(((int)A%16) || ((int)B%16) || ((int)C%16))
{
cnt1 = cnt / 32;
cnt2 = (cnt - (32*cnt1)) / 8;
cnt3 = (cnt - (32*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A, input1
mov esi, B; // Address of B, input2
mov ebx, C; // Address of C, output1
mov ecx, cnt1; // Counter
jecxz ZERO;
L1:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [edi+16]; //Load from A
movupd xmm2, [edi+32]; //Load from A
movupd xmm3, [edi+48]; //Load from A
movupd xmm4, [esi]; //Load from B
movupd xmm5, [esi+16]; //Load from B
movupd xmm6, [esi+32]; //Load from B
movupd xmm7, [esi+48]; //Load from B
paddw xmm0, xmm4; //Multiply A*B
paddw xmm1, xmm5; //Multiply A*B
paddw xmm2, xmm6; //Multiply A*B
paddw xmm3, xmm7; //Multiply A*B
movupd [ebx], xmm0; //Move into C
movupd [ebx+16], xmm1; //Move into C
movupd [ebx+32], xmm2; //Move into C
movupd [ebx+48], xmm3; //Move into C
add edi, 64;
add esi, 64;
add ebx, 64;
loop L1; // Loop if not done
ZERO:
mov ecx, cnt2;
jecxz ZERO1;
L2:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [esi]; //Load from B
paddw xmm0, xmm1; //Multiply A*B
movupd [ebx], xmm0; //Move into C
add edi, 16;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -