📄 sse.cpp
字号:
int cnt1, cnt2;
cnt1 = cnt / 4;
cnt2 = cnt - (cnt1*4);
short M[8] = {1,-1,1,1,1,-1,1,1};
__asm
{
//Set up for loop
mov edi, A; // Address of A source1
mov esi, B; // Address of B source2
mov ebx, C; // Address of C output1
mov ecx, cnt1; // Counter
movupd xmm7, M; // Move the multiply thingie
movss xmm6, shift; // Move the round thingie
jecxz ZERO1;
L1:
movlpd xmm0, [edi]; //Copy from A
movlpd xmm1, [edi+8]; //Copy from A
movlpd xmm3, [esi]; //Copy from B
movlpd xmm4, [esi+8]; //Copy from B
punpckldq xmm0, xmm0; //Copy low 32 bits to high 32 bits
punpckldq xmm1, xmm1; //Copy low 32 bits to high 32 bits
punpckldq xmm3, xmm3; //Copy low 32 bits to high 32 bits
punpckldq xmm4, xmm4; //Copy low 32 bits to high 32 bits
pshuflw xmm3, xmm3, 0x14; //Shuffle Low 64 bits to get [Re Im Im Re]
pshuflw xmm4, xmm4, 0x14; //Shuffle Low 64 bits to get [Re Im Im Re]
pshufhw xmm3, xmm3, 0x14; //Shuffle High 64 bits to get [Re Im Im Re]
pshufhw xmm4, xmm4, 0x14; //Shuffle High 64 bits to get [Re Im Im Re]
pmullw xmm3, xmm7; //Multiply to get [Re Im -Im Re]
pmullw xmm4, xmm7; //Multiply to get [Re Im -Im Re]
pmaddwd xmm0, xmm3; //Complex multiply and add
pmaddwd xmm1, xmm4; //Complex multiply and add
psrad xmm0, xmm6; //Shift by X bits
psrad xmm1, xmm6; //Shift by X bits
packssdw xmm0, xmm0; //Get into low 64 bits
packssdw xmm1, xmm1; //Get into low 64 bits
movsd [ebx], xmm0; //Move into C
movsd [ebx+8], xmm1; //Move into C
add edi, 16; //Move in array
add esi, 16; //Move in array
add ebx, 16; //Move in array
loop L1; // Loop if not done
ZERO1:
mov ecx, cnt2;
jecxz ZERO2;
L2:
movlpd xmm0, [edi]; //Copy from A
movlpd xmm1, [esi]; //Copy from B
punpckldq xmm0, xmm0; //Copy low 32 bits to high 32 bits
punpckldq xmm1, xmm1; //Copy low 32 bits to high 32 bits
pshuflw xmm1, xmm1, 0x14; //Shuffle Low 64 bits to get [Re Im Im Re]
pmullw xmm1, xmm7; //Multiply to get [Re Im -Im Re]
pmaddwd xmm0, xmm1; //Complex multiply and add
psrad xmm0, xmm6; //Shift by X bits
packssdw xmm0, xmm0; //Get into low 32 bits
movd [ebx], xmm0; //Move into A
add edi, 4;
add esi, 4;
add ebx, 4;
loop L2;
ZERO2:
EMMS;
}
}
// sse_crot (in-place): multiply every complex sample in A by one complex constant.
//
// A     - cnt complex samples stored as interleaved 16-bit pairs [Re, Im];
//         each sample is overwritten with its rotated value.
// B     - a single complex constant stored as a 16-bit pair [Re, Im].
// cnt   - number of complex samples in A. Values <= 0 are a no-op (previously a
//         negative count sent the asm loop counter through ~2^32 iterations).
// shift - arithmetic right shift applied to each 32-bit product sum before it is
//         saturated back to 16 bits.
//
// Per sample:  Re' = sat16((Ar*Br - Ai*Bi) >> shift)
//              Im' = sat16((Ar*Bi + Ai*Br) >> shift)
//
// Written for the MSVC 32-bit inline assembler; requires SSE2. Only XMM registers
// are used, so no EMMS is issued on exit.
void sse_crot(void *A, void *B, int cnt, int shift)
{
    if (cnt <= 0)
        return;

    int cnt1 = cnt / 8;             // blocks of 8 samples (32 bytes)
    int cnt2 = cnt - (cnt1 * 8);    // left-over samples handled one at a time

    // Word-wise sign pattern: turns [Br, Bi, Bi, Br] into [Br, -Bi, Bi, Br].
    short M[8] = {1,-1,1,1,1,-1,1,1};

    __asm
    {
        mov       edi, A                    // edi -> current sample in A
        mov       esi, B                    // esi -> rotation constant
        lea       eax, M
        movdqu    xmm7, [eax]               // xmm7 = sign pattern
        movd      xmm6, shift               // xmm6 = shift count for psrad (upper bits zero)

        // Build the rotation register: xmm7 = [Br, -Bi, Bi, Br, Br, -Bi, Bi, Br]
        movd      xmm1, dword ptr [esi]     // [Br, Bi, 0, 0, ...]
        pshuflw   xmm1, xmm1, 0x14          // [Br, Bi, Bi, Br]
        pmullw    xmm1, xmm7                // [Br, -Bi, Bi, Br]
        pshufd    xmm7, xmm1, 0x44          // replicate the low 64 bits into the high 64 bits

        mov       ecx, cnt1
        test      ecx, ecx
        jz        crot_ip_tail

    crot_ip_block:
        // Load 4 x 2 samples; all loads happen before any store.
        movq      xmm0, qword ptr [edi]
        movq      xmm1, qword ptr [edi+8]
        movq      xmm2, qword ptr [edi+16]
        movq      xmm3, qword ptr [edi+24]
        // Duplicate each sample: [s0, s1] -> [s0, s0, s1, s1]
        punpckldq xmm0, xmm0
        punpckldq xmm1, xmm1
        punpckldq xmm2, xmm2
        punpckldq xmm3, xmm3
        // Each dword pair becomes [Ar*Br - Ai*Bi, Ar*Bi + Ai*Br]
        pmaddwd   xmm0, xmm7
        pmaddwd   xmm1, xmm7
        pmaddwd   xmm2, xmm7
        pmaddwd   xmm3, xmm7
        // Scale the 32-bit results
        psrad     xmm0, xmm6
        psrad     xmm1, xmm6
        psrad     xmm2, xmm6
        psrad     xmm3, xmm6
        // Saturate to 16 bits; the two result samples land in the low 64 bits
        packssdw  xmm0, xmm0
        packssdw  xmm1, xmm1
        packssdw  xmm2, xmm2
        packssdw  xmm3, xmm3
        // Write the results back over the inputs
        movq      qword ptr [edi], xmm0
        movq      qword ptr [edi+8], xmm1
        movq      qword ptr [edi+16], xmm2
        movq      qword ptr [edi+24], xmm3
        add       edi, 32
        dec       ecx
        jnz       crot_ip_block

    crot_ip_tail:
        mov       ecx, cnt2
        test      ecx, ecx
        jz        crot_ip_done

    crot_ip_single:
        movd      xmm0, dword ptr [edi]     // one sample [Ar, Ai]
        punpckldq xmm0, xmm0                // [Ar, Ai, Ar, Ai]
        pmaddwd   xmm0, xmm7                // [Re', Im'] as 32-bit values
        psrad     xmm0, xmm6
        packssdw  xmm0, xmm0                // result sample in the low 32 bits
        movd      dword ptr [edi], xmm0
        add       edi, 4
        dec       ecx
        jnz       crot_ip_single

    crot_ip_done:
    }
}
// sse_crot (out-of-place): multiply every complex sample in A by one complex
// constant and store the products in C.
//
// A     - cnt complex input samples stored as interleaved 16-bit pairs [Re, Im].
// B     - a single complex constant stored as a 16-bit pair [Re, Im].
// C     - output buffer with room for cnt complex 16-bit samples.
// cnt   - number of complex samples to process. Values <= 0 are a no-op
//         (previously a negative count sent the asm loop counter through ~2^32
//         iterations).
// shift - arithmetic right shift applied to each 32-bit product sum before it is
//         saturated back to 16 bits.
//
// Per sample:  Cr = sat16((Ar*Br - Ai*Bi) >> shift)
//              Ci = sat16((Ar*Bi + Ai*Br) >> shift)
//
// Written for the MSVC 32-bit inline assembler; requires SSE2. Only XMM registers
// are used, so no EMMS is issued on exit.
void sse_crot(void *A, void *B, void *C, int cnt, int shift)
{
    if (cnt <= 0)
        return;

    int cnt1 = cnt / 8;             // blocks of 8 samples (32 bytes)
    int cnt2 = cnt - (cnt1 * 8);    // left-over samples handled one at a time

    // Word-wise sign pattern: turns [Br, Bi, Bi, Br] into [Br, -Bi, Bi, Br].
    short M[8] = {1,-1,1,1,1,-1,1,1};

    __asm
    {
        mov       edi, A                    // edi -> current input sample
        mov       esi, B                    // esi -> rotation constant
        mov       edx, C                    // edx -> current output sample
        lea       eax, M
        movdqu    xmm7, [eax]               // xmm7 = sign pattern
        movd      xmm6, shift               // xmm6 = shift count for psrad (upper bits zero)

        // Build the rotation register: xmm7 = [Br, -Bi, Bi, Br, Br, -Bi, Bi, Br]
        movd      xmm1, dword ptr [esi]     // [Br, Bi, 0, 0, ...]
        pshuflw   xmm1, xmm1, 0x14          // [Br, Bi, Bi, Br]
        pmullw    xmm1, xmm7                // [Br, -Bi, Bi, Br]
        pshufd    xmm7, xmm1, 0x44          // replicate the low 64 bits into the high 64 bits

        mov       ecx, cnt1
        test      ecx, ecx
        jz        crot_oop_tail

    crot_oop_block:
        // Load 4 x 2 samples from A; all loads happen before any store to C.
        movq      xmm0, qword ptr [edi]
        movq      xmm1, qword ptr [edi+8]
        movq      xmm2, qword ptr [edi+16]
        movq      xmm3, qword ptr [edi+24]
        // Duplicate each sample: [s0, s1] -> [s0, s0, s1, s1]
        punpckldq xmm0, xmm0
        punpckldq xmm1, xmm1
        punpckldq xmm2, xmm2
        punpckldq xmm3, xmm3
        // Each dword pair becomes [Ar*Br - Ai*Bi, Ar*Bi + Ai*Br]
        pmaddwd   xmm0, xmm7
        pmaddwd   xmm1, xmm7
        pmaddwd   xmm2, xmm7
        pmaddwd   xmm3, xmm7
        // Scale the 32-bit results
        psrad     xmm0, xmm6
        psrad     xmm1, xmm6
        psrad     xmm2, xmm6
        psrad     xmm3, xmm6
        // Saturate to 16 bits; the two result samples land in the low 64 bits
        packssdw  xmm0, xmm0
        packssdw  xmm1, xmm1
        packssdw  xmm2, xmm2
        packssdw  xmm3, xmm3
        // Store the results into C
        movq      qword ptr [edx], xmm0
        movq      qword ptr [edx+8], xmm1
        movq      qword ptr [edx+16], xmm2
        movq      qword ptr [edx+24], xmm3
        add       edi, 32
        add       edx, 32
        dec       ecx
        jnz       crot_oop_block

    crot_oop_tail:
        mov       ecx, cnt2
        test      ecx, ecx
        jz        crot_oop_done

    crot_oop_single:
        movd      xmm0, dword ptr [edi]     // one sample [Ar, Ai]
        punpckldq xmm0, xmm0                // [Ar, Ai, Ar, Ai]
        pmaddwd   xmm0, xmm7                // [Cr, Ci] as 32-bit values
        psrad     xmm0, xmm6
        packssdw  xmm0, xmm0                // result sample in the low 32 bits
        movd      dword ptr [edx], xmm0
        add       edi, 4
        add       edx, 4
        dec       ecx
        jnz       crot_oop_single

    crot_oop_done:
    }
}
// sse_qnt (in-place): replace every 16-bit sample in A with its sign.
//
// A   - array of cnt signed 16-bit samples; overwritten with the result.
// cnt - number of 16-bit samples. Values <= 0 are a no-op (previously a negative
//       count sent the asm loop counter through ~2^32 iterations).
//
// Negative samples become -1; zero and positive samples become +1.
//
// Samples are processed 40 at a time (5 XMM registers, 80 bytes) by the SSE2 loop;
// aligned moves are used when A is 16-byte aligned, unaligned moves otherwise.
// The remaining cnt % 40 samples are handled by a plain C++ loop.
//
// Written for the MSVC 32-bit inline assembler. Only XMM registers are used, so
// no EMMS is issued on exit.
void sse_qnt(void *A, int cnt)
{
    if (cnt <= 0)
        return;

    int cnt1 = cnt / 40;            // full 40-sample blocks
    int cnt2 = cnt - (40 * cnt1);   // left-over samples

    if (cnt1 > 0)
    {
        if (((int)A % 16))
        {
            // A is not 16-byte aligned: unaligned loads and stores.
            __asm
            {
                mov     edi, A
                mov     ecx, cnt1
                pcmpeqw xmm7, xmm7              // all ones
                psrlw   xmm7, 15                // xmm7 = 0x0001 in every word

            qnt_ip_unaligned:
                movdqu  xmm0, [edi]
                movdqu  xmm1, [edi+16]
                movdqu  xmm2, [edi+32]
                movdqu  xmm3, [edi+48]
                movdqu  xmm4, [edi+64]
                // Sign-fill: 0xFFFF for negative words, 0x0000 otherwise
                psraw   xmm0, 15
                psraw   xmm1, 15
                psraw   xmm2, 15
                psraw   xmm3, 15
                psraw   xmm4, 15
                // OR in the unit magnitude: 0xFFFF stays -1, 0x0000 becomes +1
                por     xmm0, xmm7
                por     xmm1, xmm7
                por     xmm2, xmm7
                por     xmm3, xmm7
                por     xmm4, xmm7
                movdqu  [edi], xmm0
                movdqu  [edi+16], xmm1
                movdqu  [edi+32], xmm2
                movdqu  [edi+48], xmm3
                movdqu  [edi+64], xmm4
                add     edi, 80
                dec     ecx
                jnz     qnt_ip_unaligned
            }
        }
        else
        {
            // A is 16-byte aligned; each 80-byte step keeps it aligned.
            __asm
            {
                mov     edi, A
                mov     ecx, cnt1
                pcmpeqw xmm7, xmm7              // all ones
                psrlw   xmm7, 15                // xmm7 = 0x0001 in every word

            qnt_ip_aligned:
                movdqa  xmm0, [edi]
                movdqa  xmm1, [edi+16]
                movdqa  xmm2, [edi+32]
                movdqa  xmm3, [edi+48]
                movdqa  xmm4, [edi+64]
                // Sign-fill: 0xFFFF for negative words, 0x0000 otherwise
                psraw   xmm0, 15
                psraw   xmm1, 15
                psraw   xmm2, 15
                psraw   xmm3, 15
                psraw   xmm4, 15
                // OR in the unit magnitude: 0xFFFF stays -1, 0x0000 becomes +1
                por     xmm0, xmm7
                por     xmm1, xmm7
                por     xmm2, xmm7
                por     xmm3, xmm7
                por     xmm4, xmm7
                movdqa  [edi], xmm0
                movdqa  [edi+16], xmm1
                movdqa  [edi+32], xmm2
                movdqa  [edi+48], xmm3
                movdqa  [edi+64], xmm4
                add     edi, 80
                dec     ecx
                jnz     qnt_ip_aligned
            }
        }
    }

    // Finish the samples that did not fill a whole 40-sample block.
    short *tail = (short *)A + (40 * cnt1);
    for (int i = 0; i < cnt2; i++)
        tail[i] = (short)((tail[i] < 0) ? -1 : 1);
}
// sse_qnt (out-of-place): write the sign of every 16-bit sample in A to C.
//
// A   - array of cnt signed 16-bit input samples.
// C   - output buffer with room for cnt 16-bit samples.
// cnt - number of 16-bit samples. Values <= 0 are a no-op (previously a negative
//       count sent the asm loop counter through ~2^32 iterations).
//
// Negative samples produce -1; zero and positive samples produce +1.
//
// Samples are processed 40 at a time (5 XMM registers, 80 bytes) by the SSE2 loop;
// aligned moves are used only when both A and C are 16-byte aligned, unaligned
// moves otherwise. The remaining cnt % 40 samples are handled by a plain C++ loop.
//
// Written for the MSVC 32-bit inline assembler. Only XMM registers are used, so
// no EMMS is issued on exit.
void sse_qnt(void *A, void *C, int cnt)
{
    if (cnt <= 0)
        return;

    int cnt1 = cnt / 40;            // full 40-sample blocks
    int cnt2 = cnt - (40 * cnt1);   // left-over samples

    if (cnt1 > 0)
    {
        if (((int)A % 16) || ((int)C % 16))
        {
            // At least one buffer is not 16-byte aligned: unaligned loads and stores.
            __asm
            {
                mov     edi, A                  // edi -> input
                mov     esi, C                  // esi -> output
                mov     ecx, cnt1
                pcmpeqw xmm7, xmm7              // all ones
                psrlw   xmm7, 15                // xmm7 = 0x0001 in every word

            qnt_oop_unaligned:
                movdqu  xmm0, [edi]
                movdqu  xmm1, [edi+16]
                movdqu  xmm2, [edi+32]
                movdqu  xmm3, [edi+48]
                movdqu  xmm4, [edi+64]
                // Sign-fill: 0xFFFF for negative words, 0x0000 otherwise
                psraw   xmm0, 15
                psraw   xmm1, 15
                psraw   xmm2, 15
                psraw   xmm3, 15
                psraw   xmm4, 15
                // OR in the unit magnitude: 0xFFFF stays -1, 0x0000 becomes +1
                por     xmm0, xmm7
                por     xmm1, xmm7
                por     xmm2, xmm7
                por     xmm3, xmm7
                por     xmm4, xmm7
                // Store into C
                movdqu  [esi], xmm0
                movdqu  [esi+16], xmm1
                movdqu  [esi+32], xmm2
                movdqu  [esi+48], xmm3
                movdqu  [esi+64], xmm4
                add     edi, 80
                add     esi, 80
                dec     ecx
                jnz     qnt_oop_unaligned
            }
        }
        else
        {
            // Both buffers are 16-byte aligned; each 80-byte step keeps them aligned.
            __asm
            {
                mov     edi, A                  // edi -> input
                mov     esi, C                  // esi -> output
                mov     ecx, cnt1
                pcmpeqw xmm7, xmm7              // all ones
                psrlw   xmm7, 15                // xmm7 = 0x0001 in every word

            qnt_oop_aligned:
                movdqa  xmm0, [edi]
                movdqa  xmm1, [edi+16]
                movdqa  xmm2, [edi+32]
                movdqa  xmm3, [edi+48]
                movdqa  xmm4, [edi+64]
                // Sign-fill: 0xFFFF for negative words, 0x0000 otherwise
                psraw   xmm0, 15
                psraw   xmm1, 15
                psraw   xmm2, 15
                psraw   xmm3, 15
                psraw   xmm4, 15
                // OR in the unit magnitude: 0xFFFF stays -1, 0x0000 becomes +1
                por     xmm0, xmm7
                por     xmm1, xmm7
                por     xmm2, xmm7
                por     xmm3, xmm7
                por     xmm4, xmm7
                // Store into C
                movdqa  [esi], xmm0
                movdqa  [esi+16], xmm1
                movdqa  [esi+32], xmm2
                movdqa  [esi+48], xmm3
                movdqa  [esi+64], xmm4
                add     edi, 80
                add     esi, 80
                dec     ecx
                jnz     qnt_oop_aligned
            }
        }
    }

    // Finish the samples that did not fill a whole 40-sample block.
    const short *src = (const short *)A + (40 * cnt1);
    short *dst = (short *)C + (40 * cnt1);
    for (int i = 0; i < cnt2; i++)
        dst[i] = (short)((src[i] < 0) ? -1 : 1);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -