📄 sse.cpp
字号:
movapd [edi+16], xmm1; //Move into A
movapd [edi+32], xmm2; //Move into A
movapd [edi+48], xmm3; //Move into A
movapd [edi+64], xmm4; //Move into A
movapd [edi+80], xmm5; //Move into A
movapd [edi+96], xmm6; //Move into A
add edi, 112;
add esi, 112;
loop AL1; // Loop if not done
AZERO:
mov ecx, cnt2;
jecxz AZERO1;
AL2:
movapd xmm0, [edi]; //Load from A
psubw xmm0, [esi]; //Load from B & multiply A*B
movapd [edi], xmm0; //Move into A
add edi, 16;
add esi, 16;
loop AL2;
AZERO1:
mov ecx, cnt3;
jecxz AZERO2;
mov eax, 0;
AL3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
sub ax, [esi];
mov [edi], ax;
add esi, 2;
add edi, 2;
loop AL3;
AZERO2:
EMMS;
}//end __asm
}//end if
}
// Element-wise 16-bit subtraction: C[i] = A[i] - B[i] for i in [0, cnt).
//
//   A, B : input arrays, read as 16-bit words
//   C    : output array, written as 16-bit words
//   cnt  : number of 16-bit elements to process; values <= 0 are a no-op
//
// The subtraction wraps on overflow (psubw / 16-bit sub), it does not saturate.
// If any of the three pointers is not 16-byte aligned the unaligned-move path
// is taken, otherwise the aligned path is used. Both paths run three stages:
// an unrolled SIMD loop, a single-register SIMD loop (8 words per pass) and a
// scalar loop for the last few words.
void sse_sub(void *A, void *B, void *C, int cnt)
{
    int cnt1;
    int cnt2;
    int cnt3;

    // LOOP counts ECX down to zero and JECXZ only skips an exact zero, so a
    // negative count would otherwise spin ~2^32 times and run off the buffers.
    if(cnt <= 0)
        return;

    if(((int)A%16) || ((int)B%16) || ((int)C%16))
    {
        cnt1 = cnt / 32;                        // 64-byte passes (32 words each)
        cnt2 = (cnt - (32*cnt1)) / 8;           // 16-byte passes (8 words each)
        cnt3 = (cnt - (32*cnt1) - (8*cnt2));    // leftover single words
        _asm
        {
            // Register roles: edi -> A, esi -> B, ebx -> C, ecx = pass counter
            mov edi, A
            mov esi, B
            mov ebx, C
            mov ecx, cnt1
            jecxz ZERO
        L1:
            movupd xmm0, [edi]          // four 16-byte chunks of A
            movupd xmm1, [edi+16]
            movupd xmm2, [edi+32]
            movupd xmm3, [edi+48]
            movupd xmm4, [esi]          // four 16-byte chunks of B
            movupd xmm5, [esi+16]
            movupd xmm6, [esi+32]
            movupd xmm7, [esi+48]
            psubw xmm0, xmm4            // A - B, eight words per register
            psubw xmm1, xmm5
            psubw xmm2, xmm6
            psubw xmm3, xmm7
            movupd [ebx], xmm0          // store differences into C
            movupd [ebx+16], xmm1
            movupd [ebx+32], xmm2
            movupd [ebx+48], xmm3
            add edi, 64
            add esi, 64
            add ebx, 64
            loop L1                     // next 64-byte pass
        ZERO:
            mov ecx, cnt2
            jecxz ZERO1
        L2:
            movupd xmm0, [edi]          // 8 words of A
            movupd xmm1, [esi]          // 8 words of B
            psubw xmm0, xmm1            // A - B
            movupd [ebx], xmm0          // store into C
            add edi, 16
            add esi, 16
            add ebx, 16
            loop L2
        ZERO1:
            mov ecx, cnt3
            jecxz ZERO2
            mov eax, 0
        L3:                             // scalar tail, one word per pass
            mov ax, [edi]
            sub ax, [esi]
            mov [ebx], ax
            add esi, 2
            add edi, 2
            add ebx, 2
            loop L3
        ZERO2:
            EMMS
        }//end __asm
    }
    else
    {
        cnt1 = cnt / 56;                        // 112-byte passes (56 words each)
        cnt2 = (cnt - (56*cnt1)) / 8;           // 16-byte passes (8 words each)
        cnt3 = (cnt - (56*cnt1) - (8*cnt2));    // leftover single words
        _asm
        {
            // Register roles: edi -> A, esi -> B, ebx -> C, ecx = pass counter
            mov edi, A
            mov esi, B
            mov ebx, C
            mov ecx, cnt1
            jecxz AZERO
        AL1:
            movapd xmm0, [edi]          // seven aligned 16-byte chunks of A
            movapd xmm1, [edi+16]
            movapd xmm2, [edi+32]
            movapd xmm3, [edi+48]
            movapd xmm4, [edi+64]
            movapd xmm5, [edi+80]
            movapd xmm6, [edi+96]
            psubw xmm0, [esi]           // subtract B straight from memory
            psubw xmm1, [esi+16]
            psubw xmm2, [esi+32]
            psubw xmm3, [esi+48]
            psubw xmm4, [esi+64]
            psubw xmm5, [esi+80]
            psubw xmm6, [esi+96]
            movapd [ebx], xmm0          // store differences into C
            movapd [ebx+16], xmm1
            movapd [ebx+32], xmm2
            movapd [ebx+48], xmm3
            movapd [ebx+64], xmm4
            movapd [ebx+80], xmm5
            movapd [ebx+96], xmm6
            add edi, 112
            add esi, 112
            add ebx, 112
            loop AL1                    // next 112-byte pass
        AZERO:
            mov ecx, cnt2
            jecxz AZERO1
        AL2:
            movapd xmm0, [edi]          // 8 words of A
            psubw xmm0, [esi]           // A - B
            movapd [ebx], xmm0          // store into C
            add edi, 16
            add esi, 16
            add ebx, 16
            loop AL2
        AZERO1:
            mov ecx, cnt3
            jecxz AZERO2
            mov eax, 0
        AL3:                            // scalar tail, one word per pass
            mov ax, [edi]
            sub ax, [esi]
            mov [ebx], ax
            add esi, 2
            add edi, 2
            add ebx, 2
            loop AL3
        AZERO2:
            EMMS
        }//end __asm
    }//end if
}
// In-place complex conjugate of an array of interleaved 16-bit values.
//
//   A   : array of cnt elements, each element being two 16-bit words laid out
//         as [Re, Im] (4 bytes per element)
//   cnt : number of complex elements; values <= 0 are a no-op
//
// Every word is multiplied by the matching entry of M = {1,-1,...}, which
// leaves Re untouched and negates Im. pmullw keeps only the low 16 bits of the
// product, so an Im of -32768 stays -32768.
// The unaligned-move path is used when A is not 16-byte aligned.
void sse_conj(void *A, int cnt)
{
    // LOOP counts ECX down to zero and JECXZ only skips an exact zero, so a
    // negative count would otherwise spin ~2^32 times and run off the buffer.
    if(cnt <= 0)
        return;

    int cnt1, cnt2;
    cnt1 = cnt/28;          // 112-byte passes (28 complex elements each)
    cnt2 = cnt - 28*cnt1;   // leftover elements, handled one at a time
    short M[8] = {1,-1,1,-1,1,-1,1,-1};
    if((int)A%16)
    {
        __asm
        {
            // Register roles: edi -> A, ecx = pass counter,
            // mm7 / xmm7 = sign mask M (first 4 words / all 8 words)
            mov edi, A
            mov ecx, cnt1
            movq mm7, M
            movupd xmm7, M
            jecxz ZERO1
        L1:
            movupd xmm0, [edi]          // seven 16-byte chunks of A
            movupd xmm1, [edi+16]
            movupd xmm2, [edi+32]
            movupd xmm3, [edi+48]
            movupd xmm4, [edi+64]
            movupd xmm5, [edi+80]
            movupd xmm6, [edi+96]
            pmullw xmm0, xmm7           // [Re Im ...] -> [Re -Im ...]
            pmullw xmm1, xmm7
            pmullw xmm2, xmm7
            pmullw xmm3, xmm7
            pmullw xmm4, xmm7
            pmullw xmm5, xmm7
            pmullw xmm6, xmm7
            movupd [edi], xmm0          // write back over A
            movupd [edi+16], xmm1
            movupd [edi+32], xmm2
            movupd [edi+48], xmm3
            movupd [edi+64], xmm4
            movupd [edi+80], xmm5
            movupd [edi+96], xmm6
            add edi, 112
            loop L1                     // next 112-byte pass
        ZERO1:
            mov ecx, cnt2
            jecxz ZERO2
        L2:
            movd mm0, [edi]             // one [Re Im] pair (4 bytes)
            pmullw mm0, mm7             // negate Im
            movd [edi], mm0             // write back over A
            add edi, 4
            loop L2
        ZERO2:
            EMMS                        // MMX registers were used above
        }//end __asm
    }
    else
    {
        __asm
        {
            // Register roles: edi -> A, ecx = pass counter,
            // mm7 / xmm7 = sign mask M (first 4 words / all 8 words)
            mov edi, A
            mov ecx, cnt1
            movq mm7, M
            movupd xmm7, M
            jecxz AZERO1
        AL1:
            movapd xmm0, [edi]          // seven aligned 16-byte chunks of A
            movapd xmm1, [edi+16]
            movapd xmm2, [edi+32]
            movapd xmm3, [edi+48]
            movapd xmm4, [edi+64]
            movapd xmm5, [edi+80]
            movapd xmm6, [edi+96]
            pmullw xmm0, xmm7           // [Re Im ...] -> [Re -Im ...]
            pmullw xmm1, xmm7
            pmullw xmm2, xmm7
            pmullw xmm3, xmm7
            pmullw xmm4, xmm7
            pmullw xmm5, xmm7
            pmullw xmm6, xmm7
            movapd [edi], xmm0          // write back over A
            movapd [edi+16], xmm1
            movapd [edi+32], xmm2
            movapd [edi+48], xmm3
            movapd [edi+64], xmm4
            movapd [edi+80], xmm5
            movapd [edi+96], xmm6
            add edi, 112
            loop AL1                    // next 112-byte pass
        AZERO1:
            mov ecx, cnt2
            jecxz AZERO2
        AL2:
            movd mm0, [edi]             // one [Re Im] pair (4 bytes)
            pmullw mm0, mm7             // negate Im
            movd [edi], mm0             // write back over A
            add edi, 4
            loop AL2
        AZERO2:
            EMMS                        // MMX registers were used above
        }//end __asm
    }//end if
}
// Complex conjugate of an array of interleaved 16-bit values, written to a
// separate output array.
//
//   A   : input array of cnt elements, each two 16-bit words [Re, Im]
//   C   : output array with the same layout; receives [Re, -Im]
//   cnt : number of complex elements; values <= 0 are a no-op
//
// Every word is multiplied by the matching entry of M = {1,-1,...}, which
// leaves Re untouched and negates Im. pmullw keeps only the low 16 bits of the
// product, so an Im of -32768 stays -32768.
// The unaligned-move path is used when A or C is not 16-byte aligned.
void sse_conj(void *A, void *C, int cnt)
{
    // LOOP counts ECX down to zero and JECXZ only skips an exact zero, so a
    // negative count would otherwise spin ~2^32 times and run off the buffers.
    if(cnt <= 0)
        return;

    int cnt1, cnt2;
    cnt1 = cnt/28;          // 112-byte passes (28 complex elements each)
    cnt2 = cnt - 28*cnt1;   // leftover elements, handled one at a time
    short M[8] = {1,-1,1,-1,1,-1,1,-1};
    if(((int)A%16) || ((int)C%16))
    {
        __asm
        {
            // Register roles: edi -> A (input), esi -> C (output),
            // ecx = pass counter, mm7 / xmm7 = sign mask M
            mov edi, A
            mov esi, C
            mov ecx, cnt1
            movq mm7, M
            movupd xmm7, M
            jecxz ZERO1
        L1:
            movupd xmm0, [edi]          // seven 16-byte chunks of A
            movupd xmm1, [edi+16]
            movupd xmm2, [edi+32]
            movupd xmm3, [edi+48]
            movupd xmm4, [edi+64]
            movupd xmm5, [edi+80]
            movupd xmm6, [edi+96]
            pmullw xmm0, xmm7           // [Re Im ...] -> [Re -Im ...]
            pmullw xmm1, xmm7
            pmullw xmm2, xmm7
            pmullw xmm3, xmm7
            pmullw xmm4, xmm7
            pmullw xmm5, xmm7
            pmullw xmm6, xmm7
            movupd [esi], xmm0          // store into C
            movupd [esi+16], xmm1
            movupd [esi+32], xmm2
            movupd [esi+48], xmm3
            movupd [esi+64], xmm4
            movupd [esi+80], xmm5
            movupd [esi+96], xmm6
            add edi, 112
            add esi, 112
            loop L1                     // next 112-byte pass
        ZERO1:
            mov ecx, cnt2
            jecxz ZERO2
        L2:
            movd mm0, [edi]             // one [Re Im] pair (4 bytes) from A
            pmullw mm0, mm7             // negate Im
            movd [esi], mm0             // store into C
            add edi, 4
            add esi, 4
            loop L2
        ZERO2:
            EMMS                        // MMX registers were used above
        }//end __asm
    }
    else
    {
        __asm
        {
            // Register roles: edi -> A (input), esi -> C (output),
            // ecx = pass counter, mm7 / xmm7 = sign mask M
            mov edi, A
            mov esi, C
            mov ecx, cnt1
            movq mm7, M
            movupd xmm7, M
            jecxz AZERO1
        AL1:
            movapd xmm0, [edi]          // seven aligned 16-byte chunks of A
            movapd xmm1, [edi+16]
            movapd xmm2, [edi+32]
            movapd xmm3, [edi+48]
            movapd xmm4, [edi+64]
            movapd xmm5, [edi+80]
            movapd xmm6, [edi+96]
            pmullw xmm0, xmm7           // [Re Im ...] -> [Re -Im ...]
            pmullw xmm1, xmm7
            pmullw xmm2, xmm7
            pmullw xmm3, xmm7
            pmullw xmm4, xmm7
            pmullw xmm5, xmm7
            pmullw xmm6, xmm7
            movapd [esi], xmm0          // store into C
            movapd [esi+16], xmm1
            movapd [esi+32], xmm2
            movapd [esi+48], xmm3
            movapd [esi+64], xmm4
            movapd [esi+80], xmm5
            movapd [esi+96], xmm6
            add edi, 112
            add esi, 112
            loop AL1                    // next 112-byte pass
        AZERO1:
            mov ecx, cnt2
            jecxz AZERO2
        AL2:
            movd mm0, [edi]             // one [Re Im] pair (4 bytes) from A
            pmullw mm0, mm7             // negate Im
            movd [esi], mm0             // store into C
            add edi, 4
            add esi, 4
            loop AL2
        AZERO2:
            EMMS                        // MMX registers were used above
        }//end __asm
    }//end if
}
// In-place complex multiply of interleaved 16-bit arrays: A[i] = (A[i] * B[i]) >> shift.
//
//   A     : input/output array of cnt elements, each two 16-bit words [Re, Im]
//   B     : second input array with the same layout
//   cnt   : number of complex elements; values <= 0 are a no-op
//   shift : arithmetic right shift applied to each 32-bit result before it is
//           packed back to 16 bits with signed saturation (packssdw)
//
// Per element the code forms
//   Re = ReA*ReB - ImA*ImB
//   Im = ReA*ImB + ImA*ReB
// as 32-bit sums (pmaddwd), shifts them, then saturates to 16 bits.
// Memory is accessed with movlpd / movsd / movd, none of which require
// 16-byte alignment.
void sse_cmul(void *A, void *B, int cnt, int shift)
{
    // LOOP counts ECX down to zero and JECXZ only skips an exact zero, so a
    // negative count would otherwise spin ~2^32 times and run off the buffers.
    if(cnt <= 0)
        return;

    int cnt1, cnt2;
    cnt1 = cnt / 4;             // 16-byte passes (4 complex elements each)
    cnt2 = cnt - (cnt1*4);      // leftover elements, handled one at a time
    short M[8] = {1,-1,1,1,1,-1,1,1};   // sign mask applied to the shuffled B words
    __asm
    {
        // Register roles: edi -> A, esi -> B, ecx = pass counter,
        // xmm7 = sign mask M, xmm6 = shift count
        mov edi, A
        mov esi, B
        mov ecx, cnt1
        movupd xmm7, M
        movss xmm6, shift           // 32-bit load; upper bits of xmm6 are zeroed
        jecxz ZERO1
    L1:
        movlpd xmm0, [edi]          // A elements 0,1 (8 bytes)
        movlpd xmm1, [edi+8]        // A elements 2,3
        movlpd xmm3, [esi]          // B elements 0,1
        movlpd xmm4, [esi+8]        // B elements 2,3
        punpckldq xmm0, xmm0        // duplicate each element: [Re0 Im0 Re0 Im0 Re1 Im1 Re1 Im1]
        punpckldq xmm1, xmm1
        punpckldq xmm3, xmm3
        punpckldq xmm4, xmm4
        pshuflw xmm3, xmm3, 0x14    // low four words of B  -> [Re Im Im Re]
        pshuflw xmm4, xmm4, 0x14
        pshufhw xmm3, xmm3, 0x14    // high four words of B -> [Re Im Im Re]
        pshufhw xmm4, xmm4, 0x14
        pmullw xmm3, xmm7           // apply sign mask -> [Re -Im Im Re]
        pmullw xmm4, xmm7
        pmaddwd xmm0, xmm3          // dwords: [ReA*ReB - ImA*ImB, ReA*ImB + ImA*ReB]
        pmaddwd xmm1, xmm4
        psrad xmm0, xmm6            // arithmetic shift right by 'shift'
        psrad xmm1, xmm6
        packssdw xmm0, xmm0         // saturate to 16 bits; result in low 64 bits
        packssdw xmm1, xmm1
        movsd [edi], xmm0           // write elements 0,1 back into A
        movsd [edi+8], xmm1         // write elements 2,3 back into A
        add edi, 16
        add esi, 16
        loop L1                     // next 16-byte pass
    ZERO1:
        mov ecx, cnt2
        jecxz ZERO2
    L2:
        // Only one 4-byte element is consumed per pass, so load exactly 32
        // bits; an 8-byte load here would read past the end of A and B on the
        // final element.
        movd xmm0, dword ptr [edi]  // one A element
        movd xmm1, dword ptr [esi]  // one B element
        punpckldq xmm0, xmm0        // low 64 bits: [ReA ImA ReA ImA]
        punpckldq xmm1, xmm1        // low 64 bits: [ReB ImB ReB ImB]
        pshuflw xmm1, xmm1, 0x14    // -> [ReB ImB ImB ReB]
        pmullw xmm1, xmm7           // -> [ReB -ImB ImB ReB]
        pmaddwd xmm0, xmm1          // dwords: [Re, Im] of the product
        psrad xmm0, xmm6            // arithmetic shift right by 'shift'
        packssdw xmm0, xmm0         // saturate to 16 bits; result in low 32 bits
        movd [edi], xmm0            // write the element back into A
        add edi, 4
        add esi, 4
        loop L2
    ZERO2:
        EMMS
    }
}
void sse_cmul(void *A, void *B, void *C, int cnt, int shift)
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -