📄 sse.cpp

📁 这是本人编写的软件接收机
💻 CPP
📖 第 1 页 / 共 4 页
字号:
				add esi, 16;
				add ebx, 16;

			loop L2;	

	ZERO1:

			mov ecx, cnt3;
			jecxz ZERO2;

			mov eax, 0;

			L3:								//Really finish off loop with non SIMD instructions

				mov ax, [edi];
				add ax, [esi];
				mov [ebx], ax;
				add esi, 2;
				add edi, 2;
				add ebx, 2;

			loop L3;

	ZERO2:

			EMMS;

		}//end __asm

	}
	else
	{

		cnt1 = cnt / 56;
		cnt2 = (cnt - (56*cnt1)) / 8;
		cnt3 = (cnt - (56*cnt1) - (8*cnt2));

		_asm 
		{
			//Set up for loop
			mov edi, A;	// Address of A, input1
			mov esi, B;	// Address of B, input2
			mov ebx, C; // Address of C, output1
			mov ecx, cnt1;	// Counter
			jecxz AZERO;

			AL1:

				movapd	xmm0, [edi];		//Load from A
				movapd	xmm1, [edi+16];		//Load from A
				movapd	xmm2, [edi+32];		//Load from A
				movapd	xmm3, [edi+48];		//Load from A
				movapd	xmm4, [edi+64];		//Load from A
				movapd	xmm5, [edi+80];		//Load from A
				movapd	xmm6, [edi+96];		//Load from A

				paddw xmm0, [esi];			//Load from B & multiply A*B
				paddw xmm1, [esi+16];		//Load from B & multiply A*B
				paddw xmm2, [esi+32];		//Load from B & multiply A*B
				paddw xmm3, [esi+48];		//Load from B & multiply A*B
				paddw xmm4, [esi+64];		//Load from B & multiply A*B
				paddw xmm5, [esi+80];		//Load from B & multiply A*B
				paddw xmm6, [esi+96];		//Load from B & multiply A*B

				movapd [ebx],	  xmm0;		//Move into A
				movapd [ebx+16],  xmm1;		//Move into A
				movapd [ebx+32],  xmm2;		//Move into A
				movapd [ebx+48],  xmm3;		//Move into A
				movapd [ebx+64],  xmm4;		//Move into A
				movapd [ebx+80],  xmm5;		//Move into A
				movapd [ebx+96],  xmm6;		//Move into A

				add edi, 112;
				add esi, 112;
				add ebx, 112;
				
			loop AL1;							// Loop if not done

	AZERO:
			mov ecx, cnt2;
			jecxz AZERO1;

			AL2:

				movapd  xmm0, [edi];		//Load from A
				paddw  xmm0, [esi];		//Load from B & add A+B
				movapd [ebx], xmm0;		//Move into A
				add edi, 16;
				add esi, 16;
				add ebx, 16;

			loop AL2;	

	AZERO1:

			mov ecx, cnt3;
			jecxz AZERO2;

			mov eax, 0;

			AL3:								//Really finish off loop with non SIMD instructions

				mov ax, [edi];
				add ax, [esi];
				mov [ebx], ax;
				add esi, 2;
				add edi, 2;
				add ebx, 2;

			loop AL3;

	AZERO2:

			EMMS;

		}//end __asm

	}//end if
}
















// sse_mul (in-place form)
//
// Element-wise multiply of two arrays of 16-bit words, written back over A:
//     A[i] = low 16 bits of (A[i] * B[i])      for i in [0, cnt)
//
// Parameters:
//   A   - first input and destination, cnt 16-bit words
//   B   - second input, cnt 16-bit words
//   cnt - number of 16-bit words to process; values <= 0 process nothing
//
// Two code paths are selected at run time:
//   * if A or B is not 16-byte aligned, unaligned loads/stores (movdqu) are
//     used, 32 words (4 XMM registers) per unrolled iteration;
//   * otherwise aligned loads/stores (movdqa) are used, 56 words
//     (7 XMM registers) per unrolled iteration.
// Each path then handles leftover whole vectors (8 words) one at a time and
// finally the last (< 8) words with scalar 16-bit code.
//
// NOTE: there is no run-time CPUID test for SSE2 in this routine (bit 26 of
// EDX from CPUID leaf 1); the caller is responsible for only calling it on
// SSE2-capable hardware.
// NOTE: MSVC 32-bit inline assembly; clobbers ECX, ESI, EDI, AX, XMM0-XMM7.
void sse_mul(void *A, void *B, int cnt)
{
	// A negative count would otherwise end up in ECX as a huge unsigned
	// iteration count for the scalar tail loop and run far past the buffers.
	const int total = (cnt > 0) ? cnt : 0;

	int blocks;		// fully unrolled iterations
	int vectors;	// leftover single 16-byte vectors (8 words each)
	int words;		// leftover 16-bit words (always < 8)

	if((((int)A) & 15) || (((int)B) & 15)) //unaligned version
	{

		blocks  = total / 32;
		vectors = (total - (32*blocks)) / 8;
		words   = (total - (32*blocks) - (8*vectors));

		_asm
		{
			mov edi, A;				// edi walks A (input and output)
			mov esi, B;				// esi walks B (input)

			mov ecx, blocks;
			test ecx, ecx;
			jz MulU_Vec;			// no full 32-word block

	MulU_Block:
			movdqu xmm0, [edi];		// A[0..7]
			movdqu xmm1, [edi+16];	// A[8..15]
			movdqu xmm2, [edi+32];	// A[16..23]
			movdqu xmm3, [edi+48];	// A[24..31]

			movdqu xmm4, [esi];		// B[0..7]
			movdqu xmm5, [esi+16];	// B[8..15]
			movdqu xmm6, [esi+32];	// B[16..23]
			movdqu xmm7, [esi+48];	// B[24..31]

			pmullw xmm0, xmm4;		// low 16 bits of each A*B
			pmullw xmm1, xmm5;
			pmullw xmm2, xmm6;
			pmullw xmm3, xmm7;

			movdqu [edi],    xmm0;	// write products back over A
			movdqu [edi+16], xmm1;
			movdqu [edi+32], xmm2;
			movdqu [edi+48], xmm3;

			add edi, 64;
			add esi, 64;

			dec ecx;
			jnz MulU_Block;

	MulU_Vec:
			mov ecx, vectors;
			test ecx, ecx;
			jz MulU_Tail;			// no leftover whole vector

	MulU_VecLoop:
			movdqu xmm0, [edi];		// 8 words of A
			movdqu xmm1, [esi];		// 8 words of B
			pmullw xmm0, xmm1;		// low 16 bits of each A*B
			movdqu [edi], xmm0;		// write back over A
			add edi, 16;
			add esi, 16;

			dec ecx;
			jnz MulU_VecLoop;

	MulU_Tail:
			mov ecx, words;
			test ecx, ecx;
			jz MulU_Done;			// nothing left

	MulU_TailLoop:					// last few words, one at a time, no SIMD
			mov ax, [edi];
			imul ax, [esi];			// 16-bit multiply keeps the low 16 bits
			mov [edi], ax;
			add esi, 2;
			add edi, 2;

			dec ecx;
			jnz MulU_TailLoop;

	MulU_Done:
			// Only XMM registers are used above, so this routine itself does
			// not need EMMS; it is retained so the x87/MMX state on return is
			// the same as it has always been for existing callers.
			EMMS;

		}//end __asm

	}
	else
	{

		blocks  = total / 56;
		vectors = (total - (56*blocks)) / 8;
		words   = (total - (56*blocks) - (8*vectors));

		_asm
		{
			mov edi, A;				// edi walks A (input and output), 16-byte aligned
			mov esi, B;				// esi walks B (input), 16-byte aligned

			mov ecx, blocks;
			test ecx, ecx;
			jz MulA_Vec;			// no full 56-word block

	MulA_Block:
			movdqa xmm0, [edi];		// A[0..7]
			movdqa xmm1, [edi+16];	// A[8..15]
			movdqa xmm2, [edi+32];	// A[16..23]
			movdqa xmm3, [edi+48];	// A[24..31]
			movdqa xmm4, [edi+64];	// A[32..39]
			movdqa xmm5, [edi+80];	// A[40..47]
			movdqa xmm6, [edi+96];	// A[48..55]

			pmullw xmm0, [esi];		// multiply by B straight from memory
			pmullw xmm1, [esi+16];
			pmullw xmm2, [esi+32];
			pmullw xmm3, [esi+48];
			pmullw xmm4, [esi+64];
			pmullw xmm5, [esi+80];
			pmullw xmm6, [esi+96];

			movdqa [edi],    xmm0;	// write products back over A
			movdqa [edi+16], xmm1;
			movdqa [edi+32], xmm2;
			movdqa [edi+48], xmm3;
			movdqa [edi+64], xmm4;
			movdqa [edi+80], xmm5;
			movdqa [edi+96], xmm6;

			add edi, 112;
			add esi, 112;

			dec ecx;
			jnz MulA_Block;

	MulA_Vec:
			mov ecx, vectors;
			test ecx, ecx;
			jz MulA_Tail;			// no leftover whole vector

	MulA_VecLoop:
			movdqa xmm0, [edi];		// 8 words of A
			pmullw xmm0, [esi];		// times 8 words of B
			movdqa [edi], xmm0;		// write back over A
			add edi, 16;
			add esi, 16;

			dec ecx;
			jnz MulA_VecLoop;

	MulA_Tail:
			mov ecx, words;
			test ecx, ecx;
			jz MulA_Done;			// nothing left

	MulA_TailLoop:					// last few words, one at a time, no SIMD
			mov ax, [edi];
			imul ax, [esi];			// 16-bit multiply keeps the low 16 bits
			mov [edi], ax;
			add esi, 2;
			add edi, 2;

			dec ecx;
			jnz MulA_TailLoop;

	MulA_Done:
			// See the unaligned path: kept only to leave the x87/MMX state
			// on return unchanged for existing callers.
			EMMS;

		}//end __asm

	}//end if
}
















// sse_mul (three-operand form)
//
// Element-wise multiply of two arrays of 16-bit words into a third:
//     C[i] = low 16 bits of (A[i] * B[i])      for i in [0, cnt)
//
// Parameters:
//   A   - first input, cnt 16-bit words
//   B   - second input, cnt 16-bit words
//   C   - destination, cnt 16-bit words
//   cnt - number of 16-bit words to process; values <= 0 process nothing
//
// Two code paths are selected at run time:
//   * if any of A, B, C is not 16-byte aligned, unaligned loads/stores
//     (movdqu) are used, 32 words (4 XMM registers) per unrolled iteration;
//   * otherwise aligned loads/stores (movdqa) are used, 56 words
//     (7 XMM registers) per unrolled iteration.
// Each path then handles leftover whole vectors (8 words) one at a time and
// finally the last (< 8) words with scalar 16-bit code.
//
// NOTE: no run-time check for SSE2 support is made here.
// NOTE: MSVC 32-bit inline assembly; clobbers EBX, ECX, ESI, EDI, AX,
// XMM0-XMM7.
void sse_mul(void *A, void *B, void *C, int cnt)
{
	// A negative count would otherwise end up in ECX as a huge unsigned
	// iteration count for the scalar tail loop and run far past the buffers.
	const int total = (cnt > 0) ? cnt : 0;

	int blocks;		// fully unrolled iterations
	int vectors;	// leftover single 16-byte vectors (8 words each)
	int words;		// leftover 16-bit words (always < 8)

	if((((int)A) & 15) || (((int)B) & 15) || (((int)C) & 15)) //unaligned version
	{

		blocks  = total / 32;
		vectors = (total - (32*blocks)) / 8;
		words   = (total - (32*blocks) - (8*vectors));

		_asm
		{
			mov edi, A;				// edi walks A (input 1)
			mov esi, B;				// esi walks B (input 2)
			mov ebx, C;				// ebx walks C (output)

			mov ecx, blocks;
			test ecx, ecx;
			jz Mul3U_Vec;			// no full 32-word block

	Mul3U_Block:
			movdqu xmm0, [edi];		// A[0..7]
			movdqu xmm1, [edi+16];	// A[8..15]
			movdqu xmm2, [edi+32];	// A[16..23]
			movdqu xmm3, [edi+48];	// A[24..31]
			movdqu xmm4, [esi];		// B[0..7]
			movdqu xmm5, [esi+16];	// B[8..15]
			movdqu xmm6, [esi+32];	// B[16..23]
			movdqu xmm7, [esi+48];	// B[24..31]

			pmullw xmm0, xmm4;		// low 16 bits of each A*B
			pmullw xmm1, xmm5;
			pmullw xmm2, xmm6;
			pmullw xmm3, xmm7;

			movdqu [ebx],    xmm0;	// store products into C
			movdqu [ebx+16], xmm1;
			movdqu [ebx+32], xmm2;
			movdqu [ebx+48], xmm3;

			add edi, 64;
			add esi, 64;
			add ebx, 64;

			dec ecx;
			jnz Mul3U_Block;

	Mul3U_Vec:
			mov ecx, vectors;
			test ecx, ecx;
			jz Mul3U_Tail;			// no leftover whole vector

	Mul3U_VecLoop:
			movdqu xmm0, [edi];		// 8 words of A
			movdqu xmm1, [esi];		// 8 words of B
			pmullw xmm0, xmm1;		// low 16 bits of each A*B
			movdqu [ebx], xmm0;		// store into C
			add edi, 16;
			add esi, 16;
			add ebx, 16;

			dec ecx;
			jnz Mul3U_VecLoop;

	Mul3U_Tail:
			mov ecx, words;
			test ecx, ecx;
			jz Mul3U_Done;			// nothing left

	Mul3U_TailLoop:					// last few words, one at a time, no SIMD
			mov ax, [edi];
			imul ax, [esi];			// 16-bit multiply keeps the low 16 bits
			mov [ebx], ax;
			add esi, 2;
			add edi, 2;
			add ebx, 2;

			dec ecx;
			jnz Mul3U_TailLoop;

	Mul3U_Done:
			// Only XMM registers are used above, so this routine itself does
			// not need EMMS; it is retained so the x87/MMX state on return is
			// the same as it has always been for existing callers.
			EMMS;

		}//end __asm

	}
	else
	{

		blocks  = total / 56;
		vectors = (total - (56*blocks)) / 8;
		words   = (total - (56*blocks) - (8*vectors));

		_asm
		{
			mov edi, A;				// edi walks A (input 1), 16-byte aligned
			mov esi, B;				// esi walks B (input 2), 16-byte aligned
			mov ebx, C;				// ebx walks C (output), 16-byte aligned

			mov ecx, blocks;
			test ecx, ecx;
			jz Mul3A_Vec;			// no full 56-word block

	Mul3A_Block:
			movdqa xmm0, [edi];		// A[0..7]
			movdqa xmm1, [edi+16];	// A[8..15]
			movdqa xmm2, [edi+32];	// A[16..23]
			movdqa xmm3, [edi+48];	// A[24..31]
			movdqa xmm4, [edi+64];	// A[32..39]
			movdqa xmm5, [edi+80];	// A[40..47]
			movdqa xmm6, [edi+96];	// A[48..55]

			pmullw xmm0, [esi];		// multiply by B straight from memory
			pmullw xmm1, [esi+16];
			pmullw xmm2, [esi+32];
			pmullw xmm3, [esi+48];
			pmullw xmm4, [esi+64];
			pmullw xmm5, [esi+80];
			pmullw xmm6, [esi+96];

			movdqa [ebx],    xmm0;	// store products into C
			movdqa [ebx+16], xmm1;
			movdqa [ebx+32], xmm2;
			movdqa [ebx+48], xmm3;
			movdqa [ebx+64], xmm4;
			movdqa [ebx+80], xmm5;
			movdqa [ebx+96], xmm6;

			add edi, 112;
			add esi, 112;
			add ebx, 112;

			dec ecx;
			jnz Mul3A_Block;

	Mul3A_Vec:
			mov ecx, vectors;
			test ecx, ecx;
			jz Mul3A_Tail;			// no leftover whole vector

	Mul3A_VecLoop:
			movdqa xmm0, [edi];		// 8 words of A
			pmullw xmm0, [esi];		// times 8 words of B
			movdqa [ebx], xmm0;		// store into C
			add edi, 16;
			add esi, 16;
			add ebx, 16;

			dec ecx;
			jnz Mul3A_VecLoop;

	Mul3A_Tail:
			mov ecx, words;
			test ecx, ecx;
			jz Mul3A_Done;			// nothing left

	Mul3A_TailLoop:					// last few words, one at a time, no SIMD
			mov ax, [edi];
			imul ax, [esi];			// 16-bit multiply keeps the low 16 bits
			mov [ebx], ax;
			add esi, 2;
			add edi, 2;
			add ebx, 2;

			dec ecx;
			jnz Mul3A_TailLoop;

	Mul3A_Done:
			// See the unaligned path: kept only to leave the x87/MMX state
			// on return unchanged for existing callers.
			EMMS;

		}//end __asm

	}//end if
}















void sse_sub(void *A, void *B, int cnt)
{

	//MOV EAX,1               ;request CPU feature flags
	//CPUID                   ;0Fh, 0A2h CPUID instruction
	//TEST EDX,4000000h       ;test bit 26 (SSE2)
	//JNZ >L18                ;SSE2 available

	int cnt1;
	int cnt2;
	int cnt3;

	if(((int)A%16) || ((int)B%16)) //unaligned version
	{

		cnt1 = cnt / 32;
		cnt2 = (cnt - (32*cnt1)) / 8;
		cnt3 = (cnt - (32*cnt1) - (8*cnt2));

		_asm 
		{
		
			//Set up for loop
			mov edi, A;	// Address of A
			mov esi, B;	// Address of B
			mov ecx, cnt1;	// Counter
			jecxz ZERO;

			L1:

				movupd xmm0, [edi];			//Load from A
				movupd xmm1, [edi+16];		//Load from A
				movupd xmm2, [edi+32];		//Load from A
				movupd xmm3, [edi+48];		//Load from A

				movupd xmm4, [esi];			//Load from B
				movupd xmm5, [esi+16];		//Load from B
				movupd xmm6, [esi+32];		//Load from B
				movupd xmm7, [esi+48];		//Load from B

				psubw xmm0, xmm4;			//Multiply A*B
				psubw xmm1, xmm5;			//Multiply A*B
				psubw xmm2, xmm6;			//Multiply A*B
				psubw xmm3, xmm7;			//Multiply A*B

				movupd [edi],	  xmm0;		//Move into A
				movupd [edi+16],  xmm1;		//Move into A
				movupd [edi+32],  xmm2;		//Move into A
				movupd [edi+48],  xmm3;		//Move into A

				add edi, 64;
				add esi, 64;
				
			loop L1;							// Loop if not done

	ZERO:
			mov ecx, cnt2;
			jecxz ZERO1;

			L2:

				movupd xmm0, [edi];		//Load from A
				movupd xmm1, [esi];		//Load from B
				psubw xmm0, xmm1;		//Load from B & multiply A*B
				movupd [edi], xmm0;		//Move into A
				add edi, 16;
				add esi, 16;

			loop L2;	

	ZERO1:

			mov ecx, cnt3;
			jecxz ZERO2;

			mov eax, 0;

			L3:								//Really finish off loop with non SIMD instructions

				mov ax, [edi];
				sub ax, [esi];
				mov [edi], ax;
				add esi, 2;
				add edi, 2;

			loop L3;

	ZERO2:

			EMMS;

		}//end __asm

	}
	else
	{

		cnt1 = cnt / 56;
		cnt2 = (cnt - (56*cnt1)) / 8;
		cnt3 = (cnt - (56*cnt1) - (8*cnt2));

		_asm 
		{
		
			//Set up for loop
			mov edi, A;	// Address of A
			mov esi, B;	// Address of B
			mov ecx, cnt1;	// Counter
			jecxz AZERO;

			AL1:

				movapd xmm0, [edi];		//Load from A
				movapd xmm1, [edi+16];		//Load from A
				movapd xmm2, [edi+32];		//Load from A
				movapd xmm3, [edi+48];		//Load from A
				movapd xmm4, [edi+64];		//Load from A
				movapd xmm5, [edi+80];		//Load from A
				movapd xmm6, [edi+96];		//Load from A

				psubw xmm0, [esi];			//Load from B & multiply A*B
				psubw xmm1, [esi+16];		//Load from B & multiply A*B
				psubw xmm2, [esi+32];		//Load from B & multiply A*B
				psubw xmm3, [esi+48];		//Load from B & multiply A*B
				psubw xmm4, [esi+64];		//Load from B & multiply A*B
				psubw xmm5, [esi+80];		//Load from B & multiply A*B
				psubw xmm6, [esi+96];		//Load from B & multiply A*B

				movapd [edi],	  xmm0;		//Move into A
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -