⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 block_sse.cpp

📁 H.263的编码程序,加了CPU指令优化,VC版.
💻 CPP
📖 第 1 页 / 共 3 页
字号:
			paddusw mm1,mm3
			paddusw mm2,mm4
			movq mm3,[edx]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			movq mm3,[esi]
			movq mm4,mm3
			punpcklbw mm3,mm0
			punpckhbw mm4,mm0
			paddusw mm1,mm3
			paddusw mm2,mm4
			paddusw mm1,mm7
			paddusw mm2,mm7
			psrlw mm1,2
			psrlw mm2,2
			packuswb mm1,mm2

			movq [edi],mm1

			add edi,8
			add eax,lx
			add ebx,lx
			add edx,lx
			add esi,lx
			loop loop4
			pop edi
			pop esi
		}
	}

	tmp = pred_block;
	xint2 = dxb>>1;
	xh2 = dxb & 1;
	yint2 = dyb>>1;
	yh2 = dyb & 1;

	if (!xh2 && !yh2)  //!< xh and yh are both zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 + (*(next + (y_off+i)*lx + x_off+j))/2;
				tmp++;
			}
		}
*/
		unsigned char *src = next + y_off*lx + x_off;
		_asm
		{
			push esi
			push edi
			mov esi,src
			mov edi,pred_block
			pxor mm7,mm7
			mov  ecx,8
loop5:
			movq mm1,[esi]
			movq mm3,[edi]
			
			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			psrlw mm3,1
			psrlw mm4,1

			paddusw mm1,mm3
			paddusw mm2,mm4

			packuswb mm1,mm2

			movq [edi],mm1

			add esi,lx
			add edi,8
			loop loop5
			pop edi
			pop esi
		}
	}
	else if (!xh2 && yh2) //!< yh is not zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 + 
					    (*(next + (y_off+i)*lx + x_off+j) +
					     *(next + (y_off+i+yh2)*lx + x_off+j) + 1)/4;				       			
				tmp++;
			}
		}
*/
		unsigned char *src1 = next + y_off*lx + x_off;
		unsigned char *src2 = next + (y_off+yh2)*lx + x_off;
		_asm
		{
			push edi
			mov eax,dword ptr src1
			mov ebx,dword ptr src2
			mov edi,dword ptr pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop6:
			movq mm1,[edi]
			movq mm3,[eax]
			movq mm5,[ebx]

			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7

			paddusw mm3,mm5
			paddusw mm4,mm6
			paddusw mm3,mm0
			paddusw mm4,mm0
			psrlw mm3,2
			psrlw mm4,2

			paddusw mm1,mm3
			paddusw mm2,mm4
			packuswb mm1,mm2

			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop6
			pop edi
		}
	}
	else if (xh2 && !yh2)  //!< xh is not zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 + 
					   (*(next + (y_off+i)*lx + x_off+j) +
					    *(next + (y_off+i)*lx + x_off+j+xh2) + 1)/4;
				tmp++;
			}
		}
*/
		unsigned char *src1 = next + y_off*lx + x_off;
		unsigned char *src2 = next + y_off*lx + x_off + xh2;
		_asm
		{
			push edi
			mov eax,dword ptr src1
			mov ebx,dword ptr src2
			mov edi,dword ptr pred_block
			pxor mm7,mm7
			movq mm0,m1
			mov  ecx,8
loop7:
			movq mm1,[edi]
			movq mm3,[eax]
			movq mm5,[ebx]

			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7

			paddusw mm3,mm5
			paddusw mm4,mm6
			paddusw mm3,mm0
			paddusw mm4,mm0
			psrlw mm3,2
			psrlw mm4,2

			paddusw mm1,mm3
			paddusw mm2,mm4
			packuswb mm1,mm2

			movq [edi],mm1

			add eax,lx
			add ebx,lx
			add edi,8
			loop loop7
			pop edi
		}

	}
	else   //! neither xh nor yh is zero
	{
		x_off = pix_x + xint2;
		y_off = pix_y + yint2;
/*		for (i = 0; i < 8; i++)
		{
			for (j = 0; j < 8; j++)
			{
				*tmp = (*tmp)/2 +
					   (*(next + (y_off+i)*lx + x_off+j) +
					    *(next + (y_off+i)*lx + x_off+j+xh2) +
					    *(next + (y_off+i+yh2)*lx + x_off+j) +
						*(next + (y_off+i+yh2)*lx + x_off+j+xh2) + 
						2)/8;
				tmp++;
			}
		}
*/
		unsigned char *src1 = next + y_off*lx + x_off;
		unsigned char *src2 = next + y_off*lx + x_off+xh2;
		unsigned char *src3 = next + (y_off+yh2)*lx + x_off;
		unsigned char *src4 = next + (y_off+yh2)*lx + x_off + xh2;
		_asm
		{
			push esi
			push edi

			mov esi,src1
			mov eax,src2
			mov ebx,src3
			mov edx,src4
			mov edi,dword ptr pred_block
			pxor mm7,mm7
			movq mm0,m2
			mov  ecx,8
loop8:
			movq mm1,[edi]
			movq mm2,mm1
			punpcklbw mm1,mm7
			punpckhbw mm2,mm7
			psrlw mm1,1
			psrlw mm2,1

			movq mm3,[esi]
			movq mm4,mm3
			punpcklbw mm3,mm7
			punpckhbw mm4,mm7
			movq mm5,[eax]
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7
			paddusw mm3,mm5
			paddusw mm4,mm6
			movq mm5,[ebx]
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7
			paddusw mm3,mm5
			paddusw mm4,mm6
			movq mm5,[edx]
			movq mm6,mm5
			punpcklbw mm5,mm7
			punpckhbw mm6,mm7
			paddusw mm3,mm5
			paddusw mm4,mm6

			paddusw mm3,mm0
			paddusw mm4,mm0
			psrlw mm3,3
			psrlw mm4,3

			paddusw mm1,mm3
			paddusw mm2,mm4
			packuswb mm1,mm2

			movq [edi],mm1
			
			add edi,8
			add esi,lx
			add eax,lx
			add ebx,lx
			add edx,lx
			dec ecx
			jnz loop8

			pop edi
			pop esi
		}
	}

	_asm emms
}

void make_diff_mmx(unsigned char* curr_block, int pic_width, unsigned char *pred_block, INT16 *diff_block)
{
	_asm
	{
		mov eax,dword ptr curr_block	;eax->curr_block
		mov ebx,dword ptr pred_block	;ebx->pred_block
		mov edi,dword ptr diff_block	;edi->diff_block
		mov edx,pic_width
		mov ecx,8
		pxor mm7,mm7
loopecx:
		movq mm0,[eax]
		movq mm1,mm0
		movq mm2,[ebx]
		movq mm3,mm2
		punpcklbw mm0,mm7
		punpckhbw mm1,mm7
		punpcklbw mm2,mm7
		punpckhbw mm3,mm7
		psubw mm0,mm2
		psubw mm1,mm3
		movq [edi],mm0
		movq [edi+8],mm1

		add eax,edx
		add ebx,8
		add edi,16
		dec ecx
		jnz loopecx
		emms
	}
}

void recon_pic_sse(unsigned char* recon_block, int pic_width, unsigned char *pred_block, INT16 *diff_block)
{
	__asm
	{
		mov   edi, [recon_block]
		mov   esi, [pred_block]
		mov   ebx, [diff_block]
		mov   edx, [pic_width]
		mov   ecx, 8
		pxor  mm7, mm7
loop_rp:
		movq  mm0, [esi]
		movq  mm1, [esi]
		punpcklbw mm0, mm7
		punpckhbw mm1, mm7
		movq  mm2, [ebx]
		movq  mm3, [ebx+8]
		paddw  mm0, mm2
		paddw  mm1, mm3
		packuswb mm0, mm1
		movq  [edi], mm0
		add   esi, 8
		add   edi, edx
		add   ebx, 16
		dec   ecx
		jnz   loop_rp
		emms
	}
}
int Quant_blk_I_sse (INT16 *curr_blk,INT16 *qun_blk, int QP)
{
  int nonezero = 0;
  _declspec(align(16)) float QPP2[] = {0.5/QP,0.5/QP,0.5/QP,0.5/QP};
  _declspec(align(8))short mincoeff[] = {-127,-127,-127,-127};
  _declspec(align(8))short maxcoeff[] = {127,127,127,127};
  _declspec(align(8))short mask[] = {1,1,1,1};
  _declspec(align(8))short tmp[64],*ptmp = tmp;
  unsigned	reg_mxcrold,reg_mxcrnew;	//SIMD Status Register

  if(!QP) return nonezero;

  _asm
  {
	  push esi
	  push edi
	  stmxcsr reg_mxcrold
	  mov eax,reg_mxcrold
	  and eax,0ffff9fffh
	  or  eax,02000h
	  mov reg_mxcrnew,eax
	  ldmxcsr  reg_mxcrnew
	  mov esi,curr_blk
	  mov edi,ptmp
	  mov ecx,16
	  pxor mm7,mm7
	  movq mm3,mincoeff
	  movq mm4,maxcoeff
	  movaps xmm7,QPP2

loop1:
	  movq mm0,[esi]
	  movq mm1,mm0
	  movq mm6,mm0
	  psraw mm0,15
	  pxor mm1,mm0
	  psubsw mm1,mm0

	  movq mm0,mm1
	  punpcklwd mm0,mm7
	  punpckhwd mm1,mm7
	  cvtpi2ps xmm0,mm0
	  cvtpi2ps xmm1,mm1
	  shufps xmm0,xmm1,044h
	  mulps  xmm0,xmm7
	  cvtps2pi mm0,xmm0
	  shufps xmm0,xmm0,01eh	;00011110
	  cvtps2pi mm1,xmm0
	  packssdw mm0,mm1		;mm0:level
	  
	  movq mm1,mm7
	  pcmpgtw mm1,mm6
	  pcmpgtw mm6,mm7
	  pand    mm6,mask
	  por     mm1,mm6
	  pmullw  mm0,mm1
	  movq    mm1,mm3
	  pcmpgtw mm1,mm0
	  movq    mm2,mm1

	  pand    mm1,mm3
	  pandn   mm2,mm0
	  por     mm1,mm2

	  movq    mm2,mm1
	  pcmpgtw mm2,mm4
	  movq    mm0,mm2
	  pand    mm2,mm4
	  pandn   mm0,mm1
	  pxor    mm0,mm2

	  movq [edi],mm0

	  add esi,8
	  add edi,8
	  dec ecx
	  jnz loop1
	  ldmxcsr  reg_mxcrold
	  pop edi
	  pop esi
	  emms
  }

  qun_blk[0] = mmax(1,mmin(254, (curr_blk[0]+4)/8));
  for (int i = 1; i < 64; i++) 
  {
	  qun_blk[MixZig[i]] =  tmp[i];
	  if(tmp[i]) nonezero = 1; 
  }

  return nonezero;
}


void DeQuant_blk_I_mmx (INT16 *curr_blk, INT16 *recn_blk, int QP)
{
	// Inverse-quantizes one intra block of 64 levels.
	//   curr_blk : input levels, read as curr_blk[MixZig[i]] for i = 0..63
	//              (MixZig is defined outside this function)
	//   recn_blk : 64 reconstructed coefficients
	//   QP       : quantizer
	// Non-zero level L -> sign(L) * (QP*(2*|L|+1) - (QP even ? 1 : 0)); zero -> 0.
	// Entry 0 is afterwards overwritten with curr_blk[0]*8.
	_declspec(align(8)) short qp4[]  = {(short)QP,(short)QP,(short)QP,(short)QP};
	_declspec(align(8)) short one4[] = {1,1,1,1};
	_declspec(align(8)) INT16 scan[64];

	for (int k = 0; k < 64; ++k)
		scan[k] = curr_blk[MixZig[k]];

	__asm
	{
		lea     esi, scan
		mov     edi, recn_blk
		pxor    mm7, mm7            ; mm7 = 0
		movq    mm6, qp4            ; mm6 = QP in every lane
		movq    mm3, one4           ; mm3 = 1 in every lane
		movq    mm4, mm6
		pand    mm4, mm3            ; QP & 1
		pcmpeqw mm4, mm7            ; mm4 = -1 where QP is even, else 0
		mov     ecx, 16             ; 16 iterations x 4 levels
dqi_next4:
		movq    mm5, [esi]          ; mm5: original levels
		movq    mm2, mm5
		psraw   mm2, 15             ; mm2: 0xFFFF where the level is negative
		movq    mm1, mm5
		pxor    mm1, mm2
		psubsw  mm1, mm2            ; mm1: abs(level), saturating
		psllw   mm1, 1              ; 2*abs
		paddusw mm1, mm3            ; 2*abs + 1
		pmullw  mm1, mm6            ; QP*(2*abs + 1), low 16 bits
		paddsw  mm1, mm4            ; minus 1 when QP is even
		pxor    mm1, mm2
		psubw   mm1, mm2            ; negate (wrapping) where the level was negative
		pcmpeqw mm5, mm7            ; mm5: 0xFFFF where the level is zero
		pandn   mm5, mm1            ; keep the result only for non-zero levels
		movq    [edi], mm5

		add     esi, 8
		add     edi, 8
		sub     ecx, 1
		jnz     dqi_next4
		emms
	}

	recn_blk[0] = curr_blk[0]*8;
}

int Quant_blk_P_sse (INT16 *curr_blk,INT16 *qun_blk, int QP)
{
  int nonezero = 0;
  _declspec(align(16)) float QPP2[] = {0.5/QP,0.5/QP,0.5/QP,0.5/QP};
  _declspec(align(8))short qpp[] = {QP>>1,QP>>1,QP>>1,QP>>1};
  _declspec(align(8))short mincoeff[] = {-127,-127,-127,-127};
  _declspec(align(8))short maxcoeff[] = {127,127,127,127};
  _declspec(align(8))short mask[] = {1,1,1,1};
  _declspec(align(8))short tmp[64],*ptmp = tmp;
  unsigned	reg_mxcrold,reg_mxcrnew;	//SIMD Status Register

  if(!QP) return nonezero;

  _asm
  {
	  push esi
	  push edi
	  stmxcsr reg_mxcrold
	  mov eax,reg_mxcrold
	  and eax,0ffff9fffh
	  or  eax,02000h
	  mov reg_mxcrnew,eax
	  ldmxcsr  reg_mxcrnew
	  mov esi,curr_blk
	  mov edi,ptmp
	  mov ecx,16
	  pxor mm7,mm7
	  movq mm3,mincoeff
	  movq mm4,maxcoeff
	  movaps xmm7,QPP2

loop1:
	  movq mm0,[esi]
	  movq mm1,mm0
	  movq mm6,mm0
	  psraw mm0,15
	  pxor mm1,mm0
	  psubsw mm1,mm0
	
	  psubusw mm1,qpp

	  movq mm0,mm1
	  punpcklwd mm0,mm7
	  punpckhwd mm1,mm7
	  cvtpi2ps xmm0,mm0
	  cvtpi2ps xmm1,mm1
	  shufps xmm0,xmm1,044h
	  mulps  xmm0,xmm7
	  cvtps2pi mm0,xmm0
	  shufps xmm0,xmm0,01eh	;00011110
	  cvtps2pi mm1,xmm0
	  packssdw mm0,mm1		;mm0:level
	  
	  movq mm1,mm7
	  pcmpgtw mm1,mm6
	  pcmpgtw mm6,mm7
	  pand    mm6,mask
	  por     mm1,mm6
	  pmullw  mm0,mm1
	  movq    mm1,mm3
	  pcmpgtw mm1,mm0
	  movq    mm2,mm1

	  pand    mm1,mm3
	  pandn   mm2,mm0
	  por     mm1,mm2

	  movq    mm2,mm1
	  pcmpgtw mm2,mm4
	  movq    mm0,mm2
	  pand    mm2,mm4
	  pandn   mm0,mm1
	  pxor    mm0,mm2

	  movq [edi],mm0

	  add esi,8
	  add edi,8
	  dec ecx
	  jnz loop1
	  ldmxcsr  reg_mxcrold
	  pop edi
	  pop esi
	  emms
  }

  for (int i = 0; i < 64; i++) 
  {
	  qun_blk[MixZig[i]] =  tmp[i];
	  if(tmp[i]) nonezero = 1; 
  }

  return nonezero;
}
void DeQuant_blk_P_mmx (INT16 *curr_blk, INT16 *recn_blk, int QP)
{
	// Inverse-quantizes one inter block of 64 levels.
	//   curr_blk : input levels, read as curr_blk[MixZig[i]] for i = 0..63
	//              (MixZig is defined outside this function)
	//   recn_blk : 64 reconstructed coefficients
	//   QP       : quantizer
	// Non-zero level L -> sign(L) * (QP*(2*|L|+1) - (QP even ? 1 : 0)); zero -> 0.
	_declspec(align(8)) short qp4[]  = {(short)QP,(short)QP,(short)QP,(short)QP};
	_declspec(align(8)) short one4[] = {1,1,1,1};
	_declspec(align(8)) INT16 scan[64];

	for (int k = 0; k < 64; ++k)
		scan[k] = curr_blk[MixZig[k]];

	__asm
	{
		lea     esi, scan
		mov     edi, recn_blk
		pxor    mm7, mm7            ; mm7 = 0
		movq    mm6, qp4            ; mm6 = QP in every lane
		movq    mm3, one4           ; mm3 = 1 in every lane
		movq    mm4, mm6
		pand    mm4, mm3            ; QP & 1
		pcmpeqw mm4, mm7            ; mm4 = -1 where QP is even, else 0
		mov     ecx, 16             ; 16 iterations x 4 levels
dqp_next4:
		movq    mm5, [esi]          ; mm5: original levels
		movq    mm2, mm5
		psraw   mm2, 15             ; mm2: 0xFFFF where the level is negative
		movq    mm1, mm5
		pxor    mm1, mm2
		psubsw  mm1, mm2            ; mm1: abs(level), saturating
		psllw   mm1, 1              ; 2*abs
		paddusw mm1, mm3            ; 2*abs + 1
		pmullw  mm1, mm6            ; QP*(2*abs + 1), low 16 bits
		paddsw  mm1, mm4            ; minus 1 when QP is even
		pxor    mm1, mm2
		psubw   mm1, mm2            ; negate (wrapping) where the level was negative
		pcmpeqw mm5, mm7            ; mm5: 0xFFFF where the level is zero
		pandn   mm5, mm1            ; keep the result only for non-zero levels
		movq    [edi], mm5

		add     esi, 8
		add     edi, 8
		sub     ecx, 1
		jnz     dqp_next4
		emms
	}
}

void block_copy1_mmx( unsigned char *des_block, unsigned char *src_block, int lx_des)
{
	unsigned char *tmp = src_block;
	int i, j, k;

	__asm
	{
		mov  ecx, 8
		mov  esi, dword ptr [src_block]
		mov  edi, dword ptr [des_block]
loop001:movq mm0, qword ptr [esi]
		movq qword ptr [edi], mm0
		add  esi, 8
		add  edi, dword ptr  [lx_des]
		dec  ecx
		jnz  loop001
		emms
	}
}

void block_copy2_mmx(INT16 *des_block, unsigned char *src_block, int lx_src)
{
	__asm
	{
		mov  ecx, 8
		mov  esi, [src_block]
		mov  edi, [des_block]
		pxor mm0, mm0
loop002:movq mm1, qword ptr [esi]
		movq mm2, mm1
		punpcklbw mm1, mm0
		punpckhbw mm2, mm0
		movq qword ptr [edi],   mm1
		movq qword ptr [edi+8], mm2
		add  esi, lx_src
		add  edi, 16
		dec ecx
		jnz loop002
		emms

	}
}

void block_copy3_mmx(unsigned char *des_block, INT16 *src_block, int lx_des)
{
//	block_copy3_c(des_block, src_block, lx_des);
	__asm
	{
		mov   esi, [src_block]
		mov   edi, [des_block]
		mov   ecx,  8
		mov   edx, [lx_des]
loop_bc3:
		movq  mm0, [esi]
		movq  mm1, [esi+8]
		packuswb mm0, mm1
		movq  [edi], mm0
		add   esi, 16
		add   edi, edx
		dec   ecx
		jnz   loop_bc3
		emms
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -