📄 sse.cpp
字号:
add esi, 16;
add ebx, 16;
loop L2;
ZERO1:
mov ecx, cnt3;
jecxz ZERO2;
mov eax, 0;
L3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
add ax, [esi];
mov [ebx], ax;
add esi, 2;
add edi, 2;
add ebx, 2;
loop L3;
ZERO2:
EMMS;
}//end __asm
}
else
{
cnt1 = cnt / 56;
cnt2 = (cnt - (56*cnt1)) / 8;
cnt3 = (cnt - (56*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A, input1
mov esi, B; // Address of B, input2
mov ebx, C; // Address of C, output1
mov ecx, cnt1; // Counter
jecxz AZERO;
AL1:
movapd xmm0, [edi]; //Load from A
movapd xmm1, [edi+16]; //Load from A
movapd xmm2, [edi+32]; //Load from A
movapd xmm3, [edi+48]; //Load from A
movapd xmm4, [edi+64]; //Load from A
movapd xmm5, [edi+80]; //Load from A
movapd xmm6, [edi+96]; //Load from A
paddw xmm0, [esi]; //Load from B & multiply A*B
paddw xmm1, [esi+16]; //Load from B & multiply A*B
paddw xmm2, [esi+32]; //Load from B & multiply A*B
paddw xmm3, [esi+48]; //Load from B & multiply A*B
paddw xmm4, [esi+64]; //Load from B & multiply A*B
paddw xmm5, [esi+80]; //Load from B & multiply A*B
paddw xmm6, [esi+96]; //Load from B & multiply A*B
movapd [ebx], xmm0; //Move into A
movapd [ebx+16], xmm1; //Move into A
movapd [ebx+32], xmm2; //Move into A
movapd [ebx+48], xmm3; //Move into A
movapd [ebx+64], xmm4; //Move into A
movapd [ebx+80], xmm5; //Move into A
movapd [ebx+96], xmm6; //Move into A
add edi, 112;
add esi, 112;
add ebx, 112;
loop AL1; // Loop if not done
AZERO:
mov ecx, cnt2;
jecxz AZERO1;
AL2:
movapd xmm0, [edi]; //Load from A
paddw xmm0, [esi]; //Load from B & add A+B
movapd [ebx], xmm0; //Move into A
add edi, 16;
add esi, 16;
add ebx, 16;
loop AL2;
AZERO1:
mov ecx, cnt3;
jecxz AZERO2;
mov eax, 0;
AL3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
add ax, [esi];
mov [ebx], ax;
add esi, 2;
add edi, 2;
add ebx, 2;
loop AL3;
AZERO2:
EMMS;
}//end __asm
}//end if
}
void sse_mul(void *A, void *B, int cnt)
{
//MOV EAX,1 ;request CPU feature flags
//CPUID ;0Fh, 0A2h CPUID instruction
//TEST EDX,4000000h ;test bit 26 (SSE2)
//JNZ >L18 ;SSE2 available
int cnt1;
int cnt2;
int cnt3;
if(((int)A%16) || ((int)B%16)) //unaligned version
{
cnt1 = cnt / 32;
cnt2 = (cnt - (32*cnt1)) / 8;
cnt3 = (cnt - (32*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A
mov esi, B; // Address of B
mov ecx, cnt1; // Counter
jecxz ZERO;
L1:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [edi+16]; //Load from A
movupd xmm2, [edi+32]; //Load from A
movupd xmm3, [edi+48]; //Load from A
movupd xmm4, [esi]; //Load from B
movupd xmm5, [esi+16]; //Load from B
movupd xmm6, [esi+32]; //Load from B
movupd xmm7, [esi+48]; //Load from B
pmullw xmm0, xmm4; //Multiply A*B
pmullw xmm1, xmm5; //Multiply A*B
pmullw xmm2, xmm6; //Multiply A*B
pmullw xmm3, xmm7; //Multiply A*B
movupd [edi], xmm0; //Move into A
movupd [edi+16], xmm1; //Move into A
movupd [edi+32], xmm2; //Move into A
movupd [edi+48], xmm3; //Move into A
add edi, 64;
add esi, 64;
loop L1; // Loop if not done
ZERO:
mov ecx, cnt2;
jecxz ZERO1;
L2:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [esi]; //Load from B
pmullw xmm0, xmm1; //Load from B & multiply A*B
movupd [edi], xmm0; //Move into A
add edi, 16;
add esi, 16;
loop L2;
ZERO1:
mov ecx, cnt3;
jecxz ZERO2;
mov eax, 0;
L3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
imul ax, [esi];
mov [edi], ax;
add esi, 2;
add edi, 2;
loop L3;
ZERO2:
EMMS;
}//end __asm
}
else
{
cnt1 = cnt / 56;
cnt2 = (cnt - (56*cnt1)) / 8;
cnt3 = (cnt - (56*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A
mov esi, B; // Address of B
mov ecx, cnt1; // Counter
jecxz AZERO;
AL1:
movapd xmm0, [edi]; //Load from A
movapd xmm1, [edi+16]; //Load from A
movapd xmm2, [edi+32]; //Load from A
movapd xmm3, [edi+48]; //Load from A
movapd xmm4, [edi+64]; //Load from A
movapd xmm5, [edi+80]; //Load from A
movapd xmm6, [edi+96]; //Load from A
pmullw xmm0, [esi]; //Load from B & multiply A*B
pmullw xmm1, [esi+16]; //Load from B & multiply A*B
pmullw xmm2, [esi+32]; //Load from B & multiply A*B
pmullw xmm3, [esi+48]; //Load from B & multiply A*B
pmullw xmm4, [esi+64]; //Load from B & multiply A*B
pmullw xmm5, [esi+80]; //Load from B & multiply A*B
pmullw xmm6, [esi+96]; //Load from B & multiply A*B
movapd [edi], xmm0; //Move into A
movapd [edi+16], xmm1; //Move into A
movapd [edi+32], xmm2; //Move into A
movapd [edi+48], xmm3; //Move into A
movapd [edi+64], xmm4; //Move into A
movapd [edi+80], xmm5; //Move into A
movapd [edi+96], xmm6; //Move into A
add edi, 112;
add esi, 112;
loop AL1; // Loop if not done
AZERO:
mov ecx, cnt2;
jecxz AZERO1;
AL2:
movapd xmm0, [edi]; //Load from A
pmullw xmm0, [esi]; //Load from B & multiply A*B
movapd [edi], xmm0; //Move into A
add edi, 16;
add esi, 16;
loop AL2;
AZERO1:
mov ecx, cnt3;
jecxz AZERO2;
mov eax, 0;
AL3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
imul ax, [esi];
mov [edi], ax;
add esi, 2;
add edi, 2;
loop AL3;
AZERO2:
EMMS;
}//end __asm
}//end if
}
void sse_mul(void *A, void *B, void *C, int cnt)
{
int cnt1;
int cnt2;
int cnt3;
if(((int)A%16) || ((int)B%16) || ((int)C%16))
{
cnt1 = cnt / 32;
cnt2 = (cnt - (32*cnt1)) / 8;
cnt3 = (cnt - (32*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A, input1
mov esi, B; // Address of B, input2
mov ebx, C; // Address of C, output1
mov ecx, cnt1; // Counter
jecxz ZERO;
L1:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [edi+16]; //Load from A
movupd xmm2, [edi+32]; //Load from A
movupd xmm3, [edi+48]; //Load from A
movupd xmm4, [esi]; //Load from B
movupd xmm5, [esi+16]; //Load from B
movupd xmm6, [esi+32]; //Load from B
movupd xmm7, [esi+48]; //Load from B
pmullw xmm0, xmm4; //Multiply A*B
pmullw xmm1, xmm5; //Multiply A*B
pmullw xmm2, xmm6; //Multiply A*B
pmullw xmm3, xmm7; //Multiply A*B
movupd [ebx], xmm0; //Move into C
movupd [ebx+16], xmm1; //Move into C
movupd [ebx+32], xmm2; //Move into C
movupd [ebx+48], xmm3; //Move into C
add edi, 64;
add esi, 64;
add ebx, 64;
loop L1; // Loop if not done
ZERO:
mov ecx, cnt2;
jecxz ZERO1;
L2:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [esi]; //Load from B
pmullw xmm0, xmm1; //Multiply A*B
movupd [ebx], xmm0; //Move into C
add edi, 16;
add esi, 16;
add ebx, 16;
loop L2;
ZERO1:
mov ecx, cnt3;
jecxz ZERO2;
mov eax, 0;
L3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
imul ax, [esi];
mov [ebx], ax;
add esi, 2;
add edi, 2;
add ebx, 2;
loop L3;
ZERO2:
EMMS;
}//end __asm
}
else
{
cnt1 = cnt / 56;
cnt2 = (cnt - (56*cnt1)) / 8;
cnt3 = (cnt - (56*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A, input1
mov esi, B; // Address of B, input2
mov ebx, C; // Address of C, output1
mov ecx, cnt1; // Counter
jecxz AZERO;
AL1:
movapd xmm0, [edi]; //Load from A
movapd xmm1, [edi+16]; //Load from A
movapd xmm2, [edi+32]; //Load from A
movapd xmm3, [edi+48]; //Load from A
movapd xmm4, [edi+64]; //Load from A
movapd xmm5, [edi+80]; //Load from A
movapd xmm6, [edi+96]; //Load from A
pmullw xmm0, [esi]; //Load from B & multiply A*B
pmullw xmm1, [esi+16]; //Load from B & multiply A*B
pmullw xmm2, [esi+32]; //Load from B & multiply A*B
pmullw xmm3, [esi+48]; //Load from B & multiply A*B
pmullw xmm4, [esi+64]; //Load from B & multiply A*B
pmullw xmm5, [esi+80]; //Load from B & multiply A*B
pmullw xmm6, [esi+96]; //Load from B & multiply A*B
movapd [ebx], xmm0; //Move into A
movapd [ebx+16], xmm1; //Move into A
movapd [ebx+32], xmm2; //Move into A
movapd [ebx+48], xmm3; //Move into A
movapd [ebx+64], xmm4; //Move into A
movapd [ebx+80], xmm5; //Move into A
movapd [ebx+96], xmm6; //Move into A
add edi, 112;
add esi, 112;
add ebx, 112;
loop AL1; // Loop if not done
AZERO:
mov ecx, cnt2;
jecxz AZERO1;
AL2:
movapd xmm0, [edi]; //Load from A
pmullw xmm0, [esi]; //Load from B & add A+B
movapd [ebx], xmm0; //Move into A
add edi, 16;
add esi, 16;
add ebx, 16;
loop AL2;
AZERO1:
mov ecx, cnt3;
jecxz AZERO2;
mov eax, 0;
AL3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
imul ax, [esi];
mov [ebx], ax;
add esi, 2;
add edi, 2;
add ebx, 2;
loop AL3;
AZERO2:
EMMS;
}//end __asm
}//end if
}
void sse_sub(void *A, void *B, int cnt)
{
//MOV EAX,1 ;request CPU feature flags
//CPUID ;0Fh, 0A2h CPUID instruction
//TEST EDX,4000000h ;test bit 26 (SSE2)
//JNZ >L18 ;SSE2 available
int cnt1;
int cnt2;
int cnt3;
if(((int)A%16) || ((int)B%16)) //unaligned version
{
cnt1 = cnt / 32;
cnt2 = (cnt - (32*cnt1)) / 8;
cnt3 = (cnt - (32*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A
mov esi, B; // Address of B
mov ecx, cnt1; // Counter
jecxz ZERO;
L1:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [edi+16]; //Load from A
movupd xmm2, [edi+32]; //Load from A
movupd xmm3, [edi+48]; //Load from A
movupd xmm4, [esi]; //Load from B
movupd xmm5, [esi+16]; //Load from B
movupd xmm6, [esi+32]; //Load from B
movupd xmm7, [esi+48]; //Load from B
psubw xmm0, xmm4; //Multiply A*B
psubw xmm1, xmm5; //Multiply A*B
psubw xmm2, xmm6; //Multiply A*B
psubw xmm3, xmm7; //Multiply A*B
movupd [edi], xmm0; //Move into A
movupd [edi+16], xmm1; //Move into A
movupd [edi+32], xmm2; //Move into A
movupd [edi+48], xmm3; //Move into A
add edi, 64;
add esi, 64;
loop L1; // Loop if not done
ZERO:
mov ecx, cnt2;
jecxz ZERO1;
L2:
movupd xmm0, [edi]; //Load from A
movupd xmm1, [esi]; //Load from B
psubw xmm0, xmm1; //Load from B & multiply A*B
movupd [edi], xmm0; //Move into A
add edi, 16;
add esi, 16;
loop L2;
ZERO1:
mov ecx, cnt3;
jecxz ZERO2;
mov eax, 0;
L3: //Really finish off loop with non SIMD instructions
mov ax, [edi];
sub ax, [esi];
mov [edi], ax;
add esi, 2;
add edi, 2;
loop L3;
ZERO2:
EMMS;
}//end __asm
}
else
{
cnt1 = cnt / 56;
cnt2 = (cnt - (56*cnt1)) / 8;
cnt3 = (cnt - (56*cnt1) - (8*cnt2));
_asm
{
//Set up for loop
mov edi, A; // Address of A
mov esi, B; // Address of B
mov ecx, cnt1; // Counter
jecxz AZERO;
AL1:
movapd xmm0, [edi]; //Load from A
movapd xmm1, [edi+16]; //Load from A
movapd xmm2, [edi+32]; //Load from A
movapd xmm3, [edi+48]; //Load from A
movapd xmm4, [edi+64]; //Load from A
movapd xmm5, [edi+80]; //Load from A
movapd xmm6, [edi+96]; //Load from A
psubw xmm0, [esi]; //Load from B & multiply A*B
psubw xmm1, [esi+16]; //Load from B & multiply A*B
psubw xmm2, [esi+32]; //Load from B & multiply A*B
psubw xmm3, [esi+48]; //Load from B & multiply A*B
psubw xmm4, [esi+64]; //Load from B & multiply A*B
psubw xmm5, [esi+80]; //Load from B & multiply A*B
psubw xmm6, [esi+96]; //Load from B & multiply A*B
movapd [edi], xmm0; //Move into A
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -