⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 block_sse.cpp

📁 H.263的编码程序,加了CPU指令优化,VC版.
💻 CPP
📖 第 1 页 / 共 3 页
字号:
	{
		x_off = pix_x + xint;
		y_off = pix_y + yint;
		unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
		unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh);
		_asm
		{
			mov edi,pred_block
			mov eax,src1			
			mov ebx,src2
			pxor mm0,mm0
			movq mm7,m0
			mov ecx,8
loop3:
			movq mm1,[eax]
			movq mm2,[ebx]
			pavgb mm1,mm2

			movq [edi],mm1

			add edi,8
			add eax,lx
			add ebx,lx
			loop loop3

		}
	}
	else   //! neither xh nor yh is zero
	{
		x_off = pix_x + xint;
		y_off = pix_y + yint;
		unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
		unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh);
		unsigned src3 = (unsigned)(prev + (y_off + yh)*lx + x_off);
		unsigned src4 = (unsigned)(prev + (y_off + yh)*lx + x_off +xh);
		
		_asm
		{
			mov edi,pred_block
			mov eax,src1			
			mov ebx,src2
			mov edx,src3
			mov esi,src4

			pxor mm0,mm0
			movq mm7,m1
			mov ecx,8
loop4:
			movq mm1,[eax]
			movq mm2,mm1
			punpcklbw mm1,mm0
			punpckhbw mm2,mm0
			movq mm3,[ebx]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			movq mm3,[edx]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			movq mm3,[esi]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			paddusw mm1,mm7
			paddusw mm2,mm7
			psrlw mm1,2
			psrlw mm2,2
			packuswb mm1,mm2

			movq [edi],mm1

			add edi,8
			add eax,lx
			add ebx,lx
			add edx,lx
			add esi,lx
			loop loop4

		}
	}
	_asm emms
}


//mmx
void pred_chrom_bid_mmx(int pix_x, int pix_y, unsigned char *prev, 	unsigned char *next, int lx, 
				int dxf, int dyf, int dxb, int dyb, unsigned char *pred_block)
{
	int xint1, yint1, xint2, yint2;
	int xh1, yh1, xh2, yh2;
	int i, j;
	int x_off, y_off;
	unsigned char *tmp = pred_block;
	__int64  m1 = 0x0001000100010001;
	__int64  m2 = 0x0002000200020002;

    xint1 = dxf>>1;
    xh1 = dxf & 1;
    yint1 = dyf>>1;
    yh1 = dyf & 1;


	if (!xh1 && !yh1)  //!< xh and yh are both zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;

		unsigned char *src = prev + y_off*lx + x_off;
		_asm
		{
			push esi
			push edi
			mov esi,src
			mov edi,pred_block
			mov ecx,8
loop1:
			movq mm0,[esi]
			movq [edi],mm0

			add esi,lx
			add edi,8
			loop loop1
			pop edi
			pop esi
		}
	}
	else if (!xh1 && yh1) //!< yh is not zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
					    *(prev + (y_off+i+yh1)*lx + x_off+j) + 1)>>1;
				tmp++;
			}
		}
*/
		unsigned char *src1 = prev + y_off*lx + x_off;
		unsigned char *src2 = prev + (y_off+yh1)*lx + x_off;
		_asm
		{
			push esi
			mov eax,src1
			mov ebx,src2
			mov edi,pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop2:
			movq mm1,[eax]
			movq mm3,[ebx]
			movq mm2,mm1
			movq mm4,mm3
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			paddusw mm1,mm3
			paddusw mm1,mm0
			paddusw mm2,mm4
			paddusw mm2,mm0
			psrlw mm1,1
			psrlw mm2,1
			packuswb mm1,mm2

			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop2
			pop edi
		}
	}
	else if (xh1 && !yh1)  //!< xh is not zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
					    *(prev + (y_off+i)*lx + x_off+j+xh1) + 1)>>1;
				tmp++;
			}
		}
*/
		unsigned char *src1 = prev + y_off*lx + x_off;
		unsigned char *src2 = prev + y_off*lx + x_off + xh1;
		_asm
		{
			push esi
			mov eax,src1
			mov ebx,src2
			mov edi,pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop3:
			movq mm1,[eax]
			movq mm3,[ebx]
			movq mm2,mm1
			movq mm4,mm3
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			paddusw mm1,mm3
			paddusw mm1,mm0
			paddusw mm2,mm4
			paddusw mm2,mm0
			psrlw mm1,1
			psrlw mm2,1
			packuswb mm1,mm2

			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop3
			pop edi
		}
	}
	else   //! neither xh nor yh is zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
					   *(prev + (y_off+i)*lx + x_off+j+xh1) +
						*(prev + (y_off+i+yh1)*lx + x_off+j) +
						*(prev + (y_off+i+yh1)*lx + x_off+j+xh1) + 
						2)>>2;
				tmp++;
			}
		}
*/
		unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
		unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh1);
		unsigned src3 = (unsigned)(prev + (y_off + yh1)*lx + x_off);
		unsigned src4 = (unsigned)(prev + (y_off + yh1)*lx + x_off +xh1);
		
		_asm
		{
			push esi
			push edi
			mov edi,pred_block
			mov eax,src1			
			mov ebx,src2
			mov edx,src3
			mov esi,src4

			pxor mm0,mm0
			movq mm7,m2
			mov ecx,8
loop4:
			movq mm1,[eax]
			movq mm2,mm1
			punpcklbw mm1,mm0
			punpckhbw mm2,mm0
			movq mm3,[ebx]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			movq mm3,[edx]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			movq mm3,[esi]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			paddusw mm1,mm7
			paddusw mm2,mm7
			psrlw mm1,2
			psrlw mm2,2
			packuswb mm1,mm2

			movq [edi],mm1

			add edi,8
			add eax,lx
			add ebx,lx
			add edx,lx
			add esi,lx
			loop loop4
			pop edi
			pop esi
		}
	}

	tmp = pred_block;
	xint2 = dxb>>1;
	xh2 = dxb & 1;
	yint2 = dyb>>1;
	yh2 = dyb & 1;

	if (!xh2 && !yh2)  //!< xh and yh are both zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 + (*(next + (y_off+i)*lx + x_off+j))/2;
				tmp++;
			}
		}
*/
		unsigned char *src = next + y_off*lx + x_off;
		_asm
		{
			push esi
			push edi
			mov esi,src
			mov edi,pred_block
			pxor mm7,mm7
			mov  ecx,8
loop5:
			movq mm1,[esi]
			movq mm3,[edi]
			
			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			psrlw mm3,1
			psrlw mm4,1

			paddusw mm1,mm3
			paddusw mm2,mm4

			packuswb mm1,mm2

			movq [edi],mm1

			add esi,lx
			add edi,8
			loop loop5
			pop edi
			pop esi
		}
	}
	else if (!xh2 && yh2) //!< yh is not zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 + 
					    (*(next + (y_off+i)*lx + x_off+j) +
					     *(next + (y_off+i+yh2)*lx + x_off+j) + 1)/4;				       			
				tmp++;
			}
		}
*/
		unsigned char *src1 = next + y_off*lx + x_off;
		unsigned char *src2 = next + (y_off+yh2)*lx + x_off;
		_asm
		{
			push edi
			mov eax,dword ptr src1
			mov ebx,dword ptr src2
			mov edi,dword ptr pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop6:
			movq mm1,[edi]
			movq mm3,[eax]
			movq mm5,[ebx]

			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7

			paddusw mm3,mm5
			paddusw mm4,mm6
			paddusw mm3,mm0
			paddusw mm4,mm0
			psrlw mm3,2
			psrlw mm4,2

			paddusw mm1,mm3
			paddusw mm2,mm4
			packuswb mm1,mm2

			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop6
			pop edi
		}
	}
	else if (xh2 && !yh2)  //!< xh is not zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 + 
					   (*(next + (y_off+i)*lx + x_off+j) +
					    *(next + (y_off+i)*lx + x_off+j+xh2) + 1)/4;
				tmp++;
			}
		}
*/
		unsigned char *src1 = next + y_off*lx + x_off;
		unsigned char *src2 = next + y_off*lx + x_off + xh2;
		_asm
		{
			push edi
			mov eax,dword ptr src1
			mov ebx,dword ptr src2
			mov edi,dword ptr pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop7:
			movq mm1,[edi]
			movq mm3,[eax]
			movq mm5,[ebx]

			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7

			paddusw mm3,mm5
			paddusw mm4,mm6
			paddusw mm3,mm0
			paddusw mm4,mm0
			psrlw mm3,2
			psrlw mm4,2

			paddusw mm1,mm3
			paddusw mm2,mm4
			packuswb mm1,mm2

			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop7
			pop edi
		}

	}
	else   //! neither xh nor yh is zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 +
					   (*(next + (y_off+i)*lx + x_off+j) +
					    *(next + (y_off+i)*lx + x_off+j+xh2) +
					    *(next + (y_off+i+yh2)*lx + x_off+j) +
						*(next + (y_off+i+yh2)*lx + x_off+j+xh2) + 
						2)/8;
				tmp++;
			}
		}
*/
		unsigned char *src1 = next + y_off*lx + x_off;
		unsigned char *src2 = next + y_off*lx + x_off+xh2;
		unsigned char *src3 = next + (y_off+yh2)*lx + x_off;
		unsigned char *src4 = next + (y_off+yh2)*lx + x_off + xh2;
		_asm
		{
			push esi
			push edi

			mov esi,src1
			mov eax,src2
			mov ebx,src3
			mov edx,src4
			mov edi,dword ptr pred_block
			pxor mm7,mm7
			movq mm0,m2
			mov  ecx,8
loop8:
			movq mm1,[edi]
			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm3,[esi]
			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			movq mm5,[eax]
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7
			paddusw mm3,mm5
			paddusw mm4,mm6
			movq mm5,[ebx]
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7
			paddusw mm3,mm5
			paddusw mm4,mm6
			movq mm5,[edx]
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7
			paddusw mm3,mm5
			paddusw mm4,mm6

			paddusw mm3,mm0
			paddusw mm4,mm0
			psrlw mm3,3
			psrlw mm4,3

			paddusw mm1,mm3
			paddusw mm2,mm4
			packuswb mm1,mm2

			movq [edi],mm1
			
			add edi,8
			add esi,lx
			add eax,lx
			add ebx,lx
			add edx,lx
			dec ecx
			jnz loop8

			pop edi
			pop esi
		}
	}

	_asm emms
}


//sse
void pred_chrom_bid_sse(int pix_x, int pix_y, unsigned char *prev, 	unsigned char *next, int lx, 
				int dxf, int dyf, int dxb, int dyb, unsigned char *pred_block)
{
	int xint1, yint1, xint2, yint2;
	int xh1, yh1, xh2, yh2;
	int i, j;
	int x_off, y_off;
	unsigned char *tmp = pred_block;
	__int64  m1 = 0x0001000100010001;
	__int64  m2 = 0x0002000200020002;

    xint1 = dxf>>1;
    xh1 = dxf & 1;
    yint1 = dyf>>1;
    yh1 = dyf & 1;


	if (!xh1 && !yh1)  //!< xh and yh are both zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;

		unsigned char *src = prev + y_off*lx + x_off;
		_asm
		{
			push esi
			push edi
			mov esi,src
			mov edi,pred_block
			mov ecx,8
loop1:
			movq mm0,[esi]
			movq [edi],mm0

			add esi,lx
			add edi,8
			loop loop1
			pop edi
			pop esi
		}
	}
	else if (!xh1 && yh1) //!< yh is not zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
					    *(prev + (y_off+i+yh1)*lx + x_off+j) + 1)>>1;
				tmp++;
			}
		}
*/
		unsigned char *src1 = prev + y_off*lx + x_off;
		unsigned char *src2 = prev + (y_off+yh1)*lx + x_off;
		_asm
		{
			push esi
			mov eax,src1
			mov ebx,src2
			mov edi,pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop2:
			movq mm1,[eax]
			movq mm3,[ebx]
			pavgb mm1,mm3
			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop2
			pop edi
		}
	}
	else if (xh1 && !yh1)  //!< xh is not zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
					    *(prev + (y_off+i)*lx + x_off+j+xh1) + 1)>>1;
				tmp++;
			}
		}
*/
		unsigned char *src1 = prev + y_off*lx + x_off;
		unsigned char *src2 = prev + y_off*lx + x_off + xh1;
		_asm
		{
			push esi
			mov eax,src1
			mov ebx,src2
			mov edi,pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop3:
			movq mm1,[eax]
			movq mm3,[ebx]

			pavgb mm1,mm3
			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop3
			pop edi
		}
	}
	else   //! neither xh nor yh is zero
	{
		x_off = pix_x + xint1;
		y_off = pix_y + yint1;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
					   *(prev + (y_off+i)*lx + x_off+j+xh1) +
						*(prev + (y_off+i+yh1)*lx + x_off+j) +
						*(prev + (y_off+i+yh1)*lx + x_off+j+xh1) + 
						2)>>2;
				tmp++;
			}
		}
*/
		unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
		unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh1);
		unsigned src3 = (unsigned)(prev + (y_off + yh1)*lx + x_off);
		unsigned src4 = (unsigned)(prev + (y_off + yh1)*lx + x_off +xh1);
		
		_asm
		{
			push esi
			push edi
			mov edi,pred_block
			mov eax,src1			
			mov ebx,src2
			mov edx,src3
			mov esi,src4

			pxor mm0,mm0
			movq mm7,m2
			mov ecx,8
loop4:
			movq mm1,[eax]
			movq mm2,mm1
			punpcklbw mm1,mm0
			punpckhbw mm2,mm0
			movq mm3,[ebx]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -