📄 sse.cpp

📁 这是本人编写的软件接收机
💻 CPP
📖 第 1 页 / 共 4 页
字号:
				movapd [edi+16],  xmm1;		//Move into A
				movapd [edi+32],  xmm2;		//Move into A
				movapd [edi+48],  xmm3;		//Move into A
				movapd [edi+64],  xmm4;		//Move into A
				movapd [edi+80],  xmm5;		//Move into A
				movapd [edi+96],  xmm6;		//Move into A

				add edi, 112;
				add esi, 112;
				
			loop AL1;							// Loop if not done

	AZERO:
			mov ecx, cnt2;
			jecxz AZERO1;

			AL2:

				movapd xmm0, [edi];		//Load from A
				psubw xmm0, [esi];		//Load from B & multiply A*B
				movapd [edi], xmm0;		//Move into A
				add edi, 16;
				add esi, 16;

			loop AL2;	

	AZERO1:

			mov ecx, cnt3;
			jecxz AZERO2;

			mov eax, 0;

			AL3:								//Really finish off loop with non SIMD instructions

				mov ax, [edi];
				sub ax, [esi];
				mov [edi], ax;
				add esi, 2;
				add edi, 2;

			loop AL3;

	AZERO2:

			EMMS;

		}//end __asm

	}//end if
}
















void sse_sub(void *A, void *B, void *C, int cnt)
{

	int cnt1;
	int cnt2;
	int cnt3;


	if(((int)A%16) || ((int)B%16) || ((int)C%16))
	{

		cnt1 = cnt / 32;
		cnt2 = (cnt - (32*cnt1)) / 8;
		cnt3 = (cnt - (32*cnt1) - (8*cnt2));

		_asm 
		{
		
			//Set up for loop
			mov edi, A;	// Address of A, input1
			mov esi, B;	// Address of B, input2
			mov ebx, C; // Address of C, output1
			mov ecx, cnt1;	// Counter
			jecxz ZERO;

			L1:


				movupd xmm0, [edi];			//Load from A
				movupd xmm1, [edi+16];		//Load from A
				movupd xmm2, [edi+32];		//Load from A
				movupd xmm3, [edi+48];		//Load from A
				movupd xmm4, [esi];			//Load from B
				movupd xmm5, [esi+16];		//Load from B
				movupd xmm6, [esi+32];		//Load from B
				movupd xmm7, [esi+48];		//Load from B

				psubw xmm0, xmm4;			//Multiply A*B
				psubw xmm1, xmm5;			//Multiply A*B
				psubw xmm2, xmm6;			//Multiply A*B
				psubw xmm3, xmm7;			//Multiply A*B

				movupd [ebx],	  xmm0;		//Move into C
				movupd [ebx+16],  xmm1;		//Move into C
				movupd [ebx+32],  xmm2;		//Move into C
				movupd [ebx+48],  xmm3;		//Move into C

				add edi, 64;
				add esi, 64;
				add ebx, 64;
				
			loop L1;							// Loop if not done

	ZERO:
			mov ecx, cnt2;
			jecxz ZERO1;

			L2:

				movupd xmm0, [edi];		//Load from A
				movupd xmm1, [esi];		//Load from B
				psubw  xmm0, xmm1;		//Multiply A*B
				movupd [ebx], xmm0;		//Move into C
				add edi, 16;
				add esi, 16;
				add ebx, 16;

			loop L2;	

	ZERO1:

			mov ecx, cnt3;
			jecxz ZERO2;

			mov eax, 0;

			L3:								//Really finish off loop with non SIMD instructions

				mov ax, [edi];
				sub ax, [esi];
				mov [ebx], ax;
				add esi, 2;
				add edi, 2;
				add ebx, 2;

			loop L3;

	ZERO2:

			EMMS;

		}//end __asm

	}
	else
	{

		cnt1 = cnt / 56;
		cnt2 = (cnt - (56*cnt1)) / 8;
		cnt3 = (cnt - (56*cnt1) - (8*cnt2));

		_asm 
		{
			//Set up for loop
			mov edi, A;	// Address of A, input1
			mov esi, B;	// Address of B, input2
			mov ebx, C; // Address of C, output1
			mov ecx, cnt1;	// Counter
			jecxz AZERO;

			AL1:

				movapd	xmm0, [edi];		//Load from A
				movapd	xmm1, [edi+16];		//Load from A
				movapd	xmm2, [edi+32];		//Load from A
				movapd	xmm3, [edi+48];		//Load from A
				movapd	xmm4, [edi+64];		//Load from A
				movapd	xmm5, [edi+80];		//Load from A
				movapd	xmm6, [edi+96];		//Load from A

				psubw xmm0, [esi];			//Load from B & multiply A*B
				psubw xmm1, [esi+16];		//Load from B & multiply A*B
				psubw xmm2, [esi+32];		//Load from B & multiply A*B
				psubw xmm3, [esi+48];		//Load from B & multiply A*B
				psubw xmm4, [esi+64];		//Load from B & multiply A*B
				psubw xmm5, [esi+80];		//Load from B & multiply A*B
				psubw xmm6, [esi+96];		//Load from B & multiply A*B

				movapd [ebx],	  xmm0;		//Move into A
				movapd [ebx+16],  xmm1;		//Move into A
				movapd [ebx+32],  xmm2;		//Move into A
				movapd [ebx+48],  xmm3;		//Move into A
				movapd [ebx+64],  xmm4;		//Move into A
				movapd [ebx+80],  xmm5;		//Move into A
				movapd [ebx+96],  xmm6;		//Move into A

				add edi, 112;
				add esi, 112;
				add ebx, 112;
				
			loop AL1;							// Loop if not done

	AZERO:
			mov ecx, cnt2;
			jecxz AZERO1;

			AL2:

				movapd  xmm0, [edi];		//Load from A
				psubw  xmm0, [esi];		//Load from B & add A+B
				movapd [ebx], xmm0;		//Move into A
				add edi, 16;
				add esi, 16;
				add ebx, 16;

			loop AL2;	

	AZERO1:

			mov ecx, cnt3;
			jecxz AZERO2;

			mov eax, 0;

			AL3:								//Really finish off loop with non SIMD instructions

				mov ax, [edi];
				sub ax, [esi];
				mov [ebx], ax;
				add esi, 2;
				add edi, 2;
				add ebx, 2;

			loop AL3;

	AZERO2:

			EMMS;

		}//end __asm

	}//end if
}

















void sse_conj(void *A, int cnt)
{

	int cnt1, cnt2;
	
	cnt1 = cnt/28;
	cnt2 = cnt - 28*cnt1;

	short M[8] = {1,-1,1,-1,1,-1,1,-1};

	if((int)A%16)
	{
		__asm 
		{
			
			//Set up for loop
			mov edi, A;			// Address of A	source1
			mov ecx, cnt1;		// Counter
			movq mm7, M;		// Move the multiply thingie
			movupd xmm7, M;		// Move the multiply thingie
			jecxz ZERO1;

			L1:

				movupd		xmm0, [edi];	//Load from A
				movupd		xmm1, [edi+16]; //Load from A
				movupd		xmm2, [edi+32]; //Load from A
				movupd		xmm3, [edi+48]; //Load from A
				movupd		xmm4, [edi+64]; //Load from A
				movupd		xmm5, [edi+80]; //Load from A
				movupd		xmm6, [edi+96]; //Load from A

				pmullw		xmm0, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm1, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm2, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm3, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm4, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm5, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm6, xmm7;		//Multiply to get [Re -Im Re -Im]

				movupd		[edi],    xmm0;	//Move into A
				movupd		[edi+16], xmm1;	//Move into A
				movupd		[edi+32], xmm2;	//Move into A
				movupd		[edi+48], xmm3;	//Move into A
				movupd		[edi+64], xmm4;	//Move into A
				movupd		[edi+80], xmm5;	//Move into A
				movupd		[edi+96], xmm6;	//Move into A

				add			edi, 112;		//Move in array
			
			loop L1;							// Loop if not done

	ZERO1:

			mov ecx, cnt2;		// Counter
			jecxz ZERO2;

			L2:

				movd	mm0, [edi];
				pmullw	mm0, mm7;
				movd	[edi], mm0;

				add		edi, 4;

			loop L2;

	ZERO2:

			EMMS;

		}//end __asm
	}
	else
	{
		__asm 
		{
			
			//Set up for loop
			mov edi, A;			// Address of A	source1
			mov ecx, cnt1;		// Counter
			movq mm7, M;		// Move the multiply thingie
			movupd xmm7, M;		// Move the multiply thingie
			jecxz AZERO1;

			AL1:

				movapd		xmm0, [edi];	//Load from A
				movapd		xmm1, [edi+16]; //Load from A
				movapd		xmm2, [edi+32]; //Load from A
				movapd		xmm3, [edi+48]; //Load from A
				movapd		xmm4, [edi+64]; //Load from A
				movapd		xmm5, [edi+80]; //Load from A
				movapd		xmm6, [edi+96]; //Load from A

				pmullw		xmm0, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm1, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm2, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm3, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm4, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm5, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm6, xmm7;		//Multiply to get [Re -Im Re -Im]

				movapd		[edi],    xmm0;	//Move into A
				movapd		[edi+16], xmm1;	//Move into A
				movapd		[edi+32], xmm2;	//Move into A
				movapd		[edi+48], xmm3;	//Move into A
				movapd		[edi+64], xmm4;	//Move into A
				movapd		[edi+80], xmm5;	//Move into A
				movapd		[edi+96], xmm6;	//Move into A

				add			edi, 112;		//Move in array
			
			loop AL1;							// Loop if not done

	AZERO1:

			mov ecx, cnt2;		// Counter
			jecxz AZERO2;

			AL2:

				movd	mm0, [edi];
				pmullw	mm0, mm7;
				movd	[edi], mm0;

				add		edi, 4;

			loop AL2;

	AZERO2:

			EMMS;

		}//end __asm

	}//end if

}



void sse_conj(void *A, void *C, int cnt)
{

	int cnt1, cnt2;
	
	cnt1 = cnt/28;
	cnt2 = cnt - 28*cnt1;

	short M[8] = {1,-1,1,-1,1,-1,1,-1};

	if(((int)A%16) || ((int)C%16))
	{
		__asm 
		{
			
			//Set up for loop
			mov edi, A;			// Address of A	source1
			mov esi, C;			// Address of C output1
			mov ecx, cnt1;		// Counter
			movq mm7, M;		// Move the multiply thingie
			movupd xmm7, M;		// Move the multiply thingie
			jecxz ZERO1;

			L1:

				movupd		xmm0, [edi];	//Load from A
				movupd		xmm1, [edi+16]; //Load from A
				movupd		xmm2, [edi+32]; //Load from A
				movupd		xmm3, [edi+48]; //Load from A
				movupd		xmm4, [edi+64]; //Load from A
				movupd		xmm5, [edi+80]; //Load from A
				movupd		xmm6, [edi+96]; //Load from A

				pmullw		xmm0, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm1, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm2, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm3, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm4, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm5, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm6, xmm7;		//Multiply to get [Re -Im Re -Im]

				movupd		[esi],    xmm0;	//Move into A
				movupd		[esi+16], xmm1;	//Move into A
				movupd		[esi+32], xmm2;	//Move into A
				movupd		[esi+48], xmm3;	//Move into A
				movupd		[esi+64], xmm4;	//Move into A
				movupd		[esi+80], xmm5;	//Move into A
				movupd		[esi+96], xmm6;	//Move into A

				add			edi, 112;		//Move in array
				add			esi, 112;
			
			loop L1;							// Loop if not done

	ZERO1:

			mov ecx, cnt2;		// Counter
			jecxz ZERO2;

			L2:

				movd	mm0, [edi];
				pmullw	mm0, mm7;
				movd	[esi], mm0;

				add		edi, 4;
				add		esi, 4;

			loop L2;

	ZERO2:

			EMMS;

		}//end __asm
	}
	else
	{
		__asm 
		{
			
			//Set up for loop
			mov edi, A;			// Address of A	source1
			mov esi, C;			// Address of C output1
			mov ecx, cnt1;		// Counter
			movq mm7, M;		// Move the multiply thingie
			movupd xmm7, M;		// Move the multiply thingie
			jecxz AZERO1;

			AL1:

				movapd		xmm0, [edi];	//Load from A
				movapd		xmm1, [edi+16]; //Load from A
				movapd		xmm2, [edi+32]; //Load from A
				movapd		xmm3, [edi+48]; //Load from A
				movapd		xmm4, [edi+64]; //Load from A
				movapd		xmm5, [edi+80]; //Load from A
				movapd		xmm6, [edi+96]; //Load from A

				pmullw		xmm0, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm1, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm2, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm3, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm4, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm5, xmm7;		//Multiply to get [Re -Im Re -Im]
				pmullw		xmm6, xmm7;		//Multiply to get [Re -Im Re -Im]

				movapd		[esi],    xmm0;	//Move into A
				movapd		[esi+16], xmm1;	//Move into A
				movapd		[esi+32], xmm2;	//Move into A
				movapd		[esi+48], xmm3;	//Move into A
				movapd		[esi+64], xmm4;	//Move into A
				movapd		[esi+80], xmm5;	//Move into A
				movapd		[esi+96], xmm6;	//Move into A

				add			edi, 112;		//Move in array
				add			esi, 112;		//Move in array
			
			loop AL1;							// Loop if not done

	AZERO1:

			mov ecx, cnt2;		// Counter
			jecxz AZERO2;

			AL2:

				movd	mm0, [edi];
				pmullw	mm0, mm7;
				movd	[esi], mm0;

				add		edi, 4;
				add		esi, 4;

			loop AL2;

	AZERO2:

			EMMS;

		}//end __asm

	}//end if

}



void sse_cmul(void *A, void *B, int cnt, int shift)
{

	int cnt1, cnt2;

	cnt1 = cnt / 4;
	cnt2 = cnt - (cnt1*4);

	short M[8] = {1,-1,1,1,1,-1,1,1};


	__asm 
	{
		
		//Set up for loop
		mov edi, A;			// Address of A	source1
		mov esi, B;			// Address of B	source2
		mov ecx, cnt1;		// Counter
		movupd xmm7, M;		// Move the multiply thingie
		movss  xmm6, shift;	// Move the round thingie
		jecxz ZERO1;

		L1:

			movlpd xmm0, [edi];		//Copy from A
			movlpd xmm1, [edi+8];	//Copy from A

			movlpd xmm3, [esi];		//Copy from B
			movlpd xmm4, [esi+8];	//Copy from B

			punpckldq xmm0, xmm0;	//Copy low 32 bits to high 32 bits
			punpckldq xmm1, xmm1;	//Copy low 32 bits to high 32 bits

			punpckldq xmm3, xmm3;	//Copy low 32 bits to high 32 bits
			punpckldq xmm4, xmm4;	//Copy low 32 bits to high 32 bits

			pshuflw xmm3, xmm3, 0x14; //Shuffle Low 64 bits to get [Re Im Im Re]
			pshuflw xmm4, xmm4, 0x14; //Shuffle Low 64 bits to get [Re Im Im Re]

			pshufhw xmm3, xmm3, 0x14; //Shuffle High 64 bits to get [Re Im Im Re]
			pshufhw xmm4, xmm4, 0x14; //Shuffle High 64 bits to get [Re Im Im Re]

			pmullw xmm3, xmm7;		//Multiply to get [Re Im -Im Re]
			pmullw xmm4, xmm7;		//Multiply to get [Re Im -Im Re]

			pmaddwd xmm0, xmm3;		//Complex multiply and add
			pmaddwd xmm1, xmm4;		//Complex multiply and add

			psrad xmm0, xmm6;		//Shift by X bits
			psrad xmm1, xmm6;		//Shift by X bits

			packssdw xmm0, xmm0;	//Get into low 64 bits
			packssdw xmm1, xmm1;	//Get into low 64 bits

			movsd [edi],   xmm0;	//Move into A
			movsd [edi+8], xmm1;	//Move into A

			add edi, 16;			//Move in array
			add esi, 16;			//Move in array
		
		loop L1;							// Loop if not done

ZERO1:

		mov ecx, cnt2;
		jecxz ZERO2;

L2:

			movlpd		xmm0, [edi];		//Copy from A
			movlpd		xmm1, [esi];		//Copy from B

			punpckldq	xmm0, xmm0;			//Copy low 32 bits to high 32 bits
			punpckldq	xmm1, xmm1;			//Copy low 32 bits to high 32 bits

			pshuflw		xmm1, xmm1, 0x14;	//Shuffle Low 64 bits to get [Re Im Im Re]
			pmullw		xmm1, xmm7;			//Multiply to get [Re Im -Im Re]
			pmaddwd		xmm0, xmm1;			//Complex multiply and add
			psrad		xmm0, xmm6;			//Shift by X bits
			packssdw	xmm0, xmm0;			//Get into low 32 bits
			movd		[edi], xmm0;		//Move into A

			add edi, 4;
			add esi, 4;

			loop L2;


ZERO2:
		EMMS;

	}


}



void sse_cmul(void *A, void *B, void *C, int cnt, int shift)
{
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -