📄 block_sse.cpp
字号:
paddusw mm1,mm3
paddusw mm2,mm4
movq mm3,[edx]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
movq mm3,[esi]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
paddusw mm1,mm7
paddusw mm2,mm7
psrlw mm1,2
psrlw mm2,2
packuswb mm1,mm2
movq [edi],mm1
add edi,8
add eax,lx
add ebx,lx
add edx,lx
add esi,lx
loop loop4
pop edi
pop esi
}
}
tmp = pred_block;
xint2 = dxb>>1;
xh2 = dxb & 1;
yint2 = dyb>>1;
yh2 = dyb & 1;
if (!xh2 && !yh2) //!< xh and yh are both zero
{
x_off = pix_x + xint2;
y_off = pix_y + yint2;
/* for (i = 0; i < 8; i++)
{
for (j = 0; j < 8; j++)
{
*tmp = (*tmp)/2 + (*(next + (y_off+i)*lx + x_off+j))/2;
tmp++;
}
}
*/
unsigned char *src = next + y_off*lx + x_off;
_asm
{
push esi
push edi
mov esi,src
mov edi,pred_block
pxor mm7,mm7
mov ecx,8
loop5:
movq mm1,[esi]
movq mm3,[edi]
movq mm2,mm1
punpcklbw mm1,mm7
punpckhbw mm2,mm7
psrlw mm1,1
psrlw mm2,1
movq mm4,mm3
punpcklbw mm3,mm7
punpckhbw mm4,mm7
psrlw mm3,1
psrlw mm4,1
paddusw mm1,mm3
paddusw mm2,mm4
packuswb mm1,mm2
movq [edi],mm1
add esi,lx
add edi,8
loop loop5
pop edi
pop esi
}
}
else if (!xh2 && yh2) //!< yh is not zero
{
x_off = pix_x + xint2;
y_off = pix_y + yint2;
/* for (i = 0; i < 8; i++)
{
for (j = 0; j < 8; j++)
{
*tmp = (*tmp)/2 +
(*(next + (y_off+i)*lx + x_off+j) +
*(next + (y_off+i+yh2)*lx + x_off+j) + 1)/4;
tmp++;
}
}
*/
unsigned char *src1 = next + y_off*lx + x_off;
unsigned char *src2 = next + (y_off+yh2)*lx + x_off;
_asm
{
push edi
mov eax,dword ptr src1
mov ebx,dword ptr src2
mov edi,dword ptr pred_block
pxor mm7,mm7
movq mm0,m1
mov ecx,8
loop6:
movq mm1,[edi]
movq mm3,[eax]
movq mm5,[ebx]
movq mm2,mm1
punpcklbw mm1,mm7
punpckhbw mm2,mm7
psrlw mm1,1
psrlw mm2,1
movq mm4,mm3
punpcklbw mm3,mm7
punpckhbw mm4,mm7
movq mm6,mm5
punpcklbw mm5,mm7
punpckhbw mm6,mm7
paddusw mm3,mm5
paddusw mm4,mm6
paddusw mm3,mm0
paddusw mm4,mm0
psrlw mm3,2
psrlw mm4,2
paddusw mm1,mm3
paddusw mm2,mm4
packuswb mm1,mm2
movq [edi],mm1
add eax,lx
add ebx,lx
add edi,8
loop loop6
pop edi
}
}
else if (xh2 && !yh2) //!< xh is not zero
{
x_off = pix_x + xint2;
y_off = pix_y + yint2;
/* for (i = 0; i < 8; i++)
{
for (j = 0; j < 8; j++)
{
*tmp = (*tmp)/2 +
(*(next + (y_off+i)*lx + x_off+j) +
*(next + (y_off+i)*lx + x_off+j+xh2) + 1)/4;
tmp++;
}
}
*/
unsigned char *src1 = next + y_off*lx + x_off;
unsigned char *src2 = next + y_off*lx + x_off + xh2;
_asm
{
push edi
mov eax,dword ptr src1
mov ebx,dword ptr src2
mov edi,dword ptr pred_block
pxor mm7,mm7
movq mm0,m1
mov ecx,8
loop7:
movq mm1,[edi]
movq mm3,[eax]
movq mm5,[ebx]
movq mm2,mm1
punpcklbw mm1,mm7
punpckhbw mm2,mm7
psrlw mm1,1
psrlw mm2,1
movq mm4,mm3
punpcklbw mm3,mm7
punpckhbw mm4,mm7
movq mm6,mm5
punpcklbw mm5,mm7
punpckhbw mm6,mm7
paddusw mm3,mm5
paddusw mm4,mm6
paddusw mm3,mm0
paddusw mm4,mm0
psrlw mm3,2
psrlw mm4,2
paddusw mm1,mm3
paddusw mm2,mm4
packuswb mm1,mm2
movq [edi],mm1
add eax,lx
add ebx,lx
add edi,8
loop loop7
pop edi
}
}
else //! neither xh nor yh is zero
{
x_off = pix_x + xint2;
y_off = pix_y + yint2;
/* for (i = 0; i < 8; i++)
{
for (j = 0; j < 8; j++)
{
*tmp = (*tmp)/2 +
(*(next + (y_off+i)*lx + x_off+j) +
*(next + (y_off+i)*lx + x_off+j+xh2) +
*(next + (y_off+i+yh2)*lx + x_off+j) +
*(next + (y_off+i+yh2)*lx + x_off+j+xh2) +
2)/8;
tmp++;
}
}
*/
unsigned char *src1 = next + y_off*lx + x_off;
unsigned char *src2 = next + y_off*lx + x_off+xh2;
unsigned char *src3 = next + (y_off+yh2)*lx + x_off;
unsigned char *src4 = next + (y_off+yh2)*lx + x_off + xh2;
_asm
{
push esi
push edi
mov esi,src1
mov eax,src2
mov ebx,src3
mov edx,src4
mov edi,dword ptr pred_block
pxor mm7,mm7
movq mm0,m2
mov ecx,8
loop8:
movq mm1,[edi]
movq mm2,mm1
punpcklbw mm1,mm7
punpckhbw mm2,mm7
psrlw mm1,1
psrlw mm2,1
movq mm3,[esi]
movq mm4,mm3
punpcklbw mm3,mm7
punpckhbw mm4,mm7
movq mm5,[eax]
movq mm6,mm5
punpcklbw mm5,mm7
punpckhbw mm6,mm7
paddusw mm3,mm5
paddusw mm4,mm6
movq mm5,[ebx]
movq mm6,mm5
punpcklbw mm5,mm7
punpckhbw mm6,mm7
paddusw mm3,mm5
paddusw mm4,mm6
movq mm5,[edx]
movq mm6,mm5
punpcklbw mm5,mm7
punpckhbw mm6,mm7
paddusw mm3,mm5
paddusw mm4,mm6
paddusw mm3,mm0
paddusw mm4,mm0
psrlw mm3,3
psrlw mm4,3
paddusw mm1,mm3
paddusw mm2,mm4
packuswb mm1,mm2
movq [edi],mm1
add edi,8
add esi,lx
add eax,lx
add ebx,lx
add edx,lx
dec ecx
jnz loop8
pop edi
pop esi
}
}
_asm emms
}
void make_diff_mmx(unsigned char* curr_block, int pic_width, unsigned char *pred_block, INT16 *diff_block)
{
_asm
{
mov eax,dword ptr curr_block ;eax->curr_block
mov ebx,dword ptr pred_block ;ebx->pred_block
mov edi,dword ptr diff_block ;edi->diff_block
mov edx,pic_width
mov ecx,8
pxor mm7,mm7
loopecx:
movq mm0,[eax]
movq mm1,mm0
movq mm2,[ebx]
movq mm3,mm2
punpcklbw mm0,mm7
punpckhbw mm1,mm7
punpcklbw mm2,mm7
punpckhbw mm3,mm7
psubw mm0,mm2
psubw mm1,mm3
movq [edi],mm0
movq [edi+8],mm1
add eax,edx
add ebx,8
add edi,16
dec ecx
jnz loopecx
emms
}
}
void recon_pic_sse(unsigned char* recon_block, int pic_width, unsigned char *pred_block, INT16 *diff_block)
{
__asm
{
mov edi, [recon_block]
mov esi, [pred_block]
mov ebx, [diff_block]
mov edx, [pic_width]
mov ecx, 8
pxor mm7, mm7
loop_rp:
movq mm0, [esi]
movq mm1, [esi]
punpcklbw mm0, mm7
punpckhbw mm1, mm7
movq mm2, [ebx]
movq mm3, [ebx+8]
paddw mm0, mm2
paddw mm1, mm3
packuswb mm0, mm1
movq [edi], mm0
add esi, 8
add edi, edx
add ebx, 16
dec ecx
jnz loop_rp
emms
}
}
/*
 * Quant_blk_I_sse - quantize one 8x8 intra block of 16-bit coefficients.
 *
 * For every coefficient c the SIMD loop computes
 *     level = sign(c) * floor(|c| * (0.5 / QP)),  clamped to [-127, 127]
 * into a scratch buffer in input order.  Entries 1..63 are then scattered
 * to qun_blk[MixZig[i]].  The DC term is handled separately in C:
 *     qun_blk[0] = mmax(1, mmin(254, (curr_blk[0] + 4) / 8)).
 *
 *   curr_blk  64 input coefficients
 *   qun_blk   64 output levels
 *   QP        quantizer; QP == 0 returns 0 at once and leaves qun_blk untouched
 *
 * Returns 1 when any of scratch entries 1..63 is non-zero, otherwise 0
 * (the DC term does not influence the return value).
 *
 * The QP guard runs before the reciprocal is formed so that 0.5 / QP is
 * never evaluated with QP == 0.
 *
 * NOTE(review): scaling multiplies by a single-precision reciprocal while the
 * rounding mode is round-down, so for some QP an exact multiple of 2*QP may
 * come out one level lower than an integer division would give (QP = 25,
 * |c| = 50 looks like such a case) - confirm against the scalar reference.
 */
int Quant_blk_I_sse (INT16 *curr_blk,INT16 *qun_blk, int QP)
{
    int nonezero = 0;
    if(!QP) return nonezero;

    const float recip = (float)(0.5/QP);        // explicit cast: double -> float
    _declspec(align(16)) float QPP2[] = {recip,recip,recip,recip};
    _declspec(align(8))short mincoeff[] = {-127,-127,-127,-127};
    _declspec(align(8))short maxcoeff[] = {127,127,127,127};
    _declspec(align(8))short mask[] = {1,1,1,1};
    _declspec(align(8))short tmp[64],*ptmp = tmp;
    unsigned reg_mxcrold,reg_mxcrnew;           // saved / modified MXCSR images

    _asm
    {
        push        esi
        push        edi

        ; Set the MXCSR rounding-control field (bits 13-14) to 01b so that the
        ; float to integer conversions below round toward minus infinity.
        stmxcsr     reg_mxcrold
        mov         eax, reg_mxcrold
        and         eax, 0ffff9fffh
        or          eax, 02000h
        mov         reg_mxcrnew, eax
        ldmxcsr     reg_mxcrnew

        mov         esi, curr_blk
        mov         edi, ptmp
        mov         ecx, 16                     ; 16 groups of 4 coefficients
        pxor        mm7, mm7                    ; mm7  = 0
        movq        mm3, mincoeff               ; mm3  = lower clamp
        movq        mm4, maxcoeff               ; mm4  = upper clamp
        movaps      xmm7, QPP2                  ; xmm7 = 0.5 / QP in all four lanes

    quant_group:
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm6, mm0                    ; mm6 keeps the signed input
        psraw       mm0, 15                     ; mm0 = sign mask (0 or -1)
        pxor        mm1, mm0
        psubsw      mm1, mm0                    ; mm1 = abs(input), saturating

        movq        mm0, mm1
        punpcklwd   mm0, mm7                    ; widen lanes 0,1 to dwords
        punpckhwd   mm1, mm7                    ; widen lanes 2,3 to dwords
        cvtpi2ps    xmm0, mm0
        cvtpi2ps    xmm1, mm1
        shufps      xmm0, xmm1, 044h            ; xmm0 = four magnitudes as floats
        mulps       xmm0, xmm7                  ; scale by 0.5 / QP
        cvtps2pi    mm0, xmm0                   ; lanes 0,1 back to integers
        shufps      xmm0, xmm0, 01eh            ; bring lanes 2,3 down to 0,1
        cvtps2pi    mm1, xmm0
        packssdw    mm0, mm1                    ; mm0 = unsigned level, 4 words

        movq        mm1, mm7
        pcmpgtw     mm1, mm6                    ; -1 where input < 0
        pcmpgtw     mm6, mm7                    ; all ones where input > 0
        pand        mm6, mask                   ; +1 where input > 0
        por         mm1, mm6                    ; mm1 = -1, 0 or +1 per lane
        pmullw      mm0, mm1                    ; apply the sign

        movq        mm1, mm3
        pcmpgtw     mm1, mm0                    ; lanes below the minimum
        movq        mm2, mm1
        pand        mm1, mm3
        pandn       mm2, mm0
        por         mm1, mm2                    ; mm1 = max(level, -127)

        movq        mm2, mm1
        pcmpgtw     mm2, mm4                    ; lanes above the maximum
        movq        mm0, mm2
        pand        mm2, mm4
        pandn       mm0, mm1
        pxor        mm0, mm2                    ; mm0 = min(mm1, 127)

        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
        dec         ecx
        jnz         quant_group

        ldmxcsr     reg_mxcrold                 ; restore the caller rounding mode
        pop         edi
        pop         esi
        emms
    }

    // DC term is quantized on its own and is not part of the nonezero test.
    qun_blk[0] = mmax(1,mmin(254, (curr_blk[0]+4)/8));
    for (int i = 1; i < 64; i++)
    {
        qun_blk[MixZig[i]] = tmp[i];
        if(tmp[i]) nonezero = 1;
    }
    return nonezero;
}
/*
 * DeQuant_blk_I_mmx - inverse-quantize one 8x8 intra block.
 *
 * Levels are first gathered as tmp[i] = curr_blk[MixZig[i]].  For each level
 * L the MMX loop then writes to recn_blk[i]:
 *     0                                             when L == 0
 *     sign(L) * (QP * (2*|L| + 1) - (QP even ? 1 : 0))   otherwise
 * Finally the DC term is overwritten with recn_blk[0] = curr_blk[0] * 8.
 *
 *   curr_blk  64 quantized levels
 *   recn_blk  64 reconstructed coefficients
 *   QP        quantizer; stored as four 16-bit lanes for the multiply
 */
void DeQuant_blk_I_mmx (INT16 *curr_blk, INT16 *recn_blk, int QP)
{
    int i;
    const short qp16 = (short)QP;               // explicit cast: int -> short
    _declspec(align(8)) short QPP2[] = {qp16,qp16,qp16,qp16};
    _declspec(align(8)) unsigned mask[] = {0xffffffff,0xffffffff};
    _declspec(align(8)) short inc1[] = {1,1,1,1};
    _declspec(align(8)) INT16 tmp[64],*ptmp = tmp;

    for (i = 0; i < 64; i++) tmp[i] = curr_blk[MixZig[i]];

    _asm
    {
        push        esi
        push        edi
        mov         esi, ptmp
        mov         edi, recn_blk
        mov         ecx, 16                     ; 16 groups of 4 levels
        pxor        mm7, mm7                    ; mm7 = 0
        movq        mm6, QPP2                   ; mm6 = QP in every lane

    dequant_group:
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm5, mm0                    ; mm5 keeps the signed level
        psraw       mm0, 15
        pxor        mm1, mm0
        psubsw      mm1, mm0                    ; mm1 = abs(level)
        psllw       mm1, 1                      ; 2*abs(level)
        paddusw     mm1, inc1                   ; 2*abs(level) + 1
        pmullw      mm1, mm6
        movq        mm0, mm1                    ; mm0 = QP * (2*abs(level) + 1)

        movq        mm1, mm6
        pand        mm1, inc1                   ; QP & 1
        pcmpeqw     mm1, inc1                   ; all ones when QP is odd
        pandn       mm1, mask                   ; -1 when QP is even, else 0
        paddsw      mm0, mm1                    ; subtract 1 for even QP

        movq        mm1, mm0
        movq        mm2, mm7
        pcmpgtw     mm2, mm5                    ; -1 where level < 0
        pmullw      mm1, mm2                    ; negated magnitude in negative lanes
        pandn       mm2, mm0                    ; magnitude in non-negative lanes
        por         mm1, mm2                    ; mm1 = signed reconstruction

        pcmpeqw     mm5, mm7                    ; is the level zero?
        pandn       mm5, mask                   ; all ones where level != 0
        pand        mm1, mm5                    ; force zero levels to zero

        movq        [edi], mm1
        add         esi, 8
        add         edi, 8
        dec         ecx                         ; dec/jnz, matching the quantizer loops
        jnz         dequant_group

        pop         edi
        pop         esi
        emms
    }

    // DC term uses a fixed step of 8 and replaces the SIMD result.
    recn_blk[0] = curr_blk[0]*8;
}
/*
 * Quant_blk_P_sse - quantize one 8x8 inter block of 16-bit coefficients.
 *
 * For every coefficient c the SIMD loop computes
 *     m     = max(|c| - (QP >> 1), 0)
 *     level = sign(c) * floor(m * (0.5 / QP)),  clamped to [-127, 127]
 * into a scratch buffer in input order; all 64 entries are then scattered
 * to qun_blk[MixZig[i]].
 *
 *   curr_blk  64 input coefficients
 *   qun_blk   64 output levels
 *   QP        quantizer; QP == 0 returns 0 at once and leaves qun_blk untouched
 *
 * Returns 1 when any produced level is non-zero, otherwise 0.
 *
 * The QP guard runs before the reciprocal is formed so that 0.5 / QP is
 * never evaluated with QP == 0.
 *
 * NOTE(review): scaling multiplies by a single-precision reciprocal while the
 * rounding mode is round-down, so for some QP the result may be one level
 * lower than an integer division would give - confirm against the scalar
 * reference.
 */
int Quant_blk_P_sse (INT16 *curr_blk,INT16 *qun_blk, int QP)
{
    int nonezero = 0;
    if(!QP) return nonezero;

    const float recip = (float)(0.5/QP);        // explicit cast: double -> float
    const short halfqp = (short)(QP>>1);        // explicit cast: int -> short
    _declspec(align(16)) float QPP2[] = {recip,recip,recip,recip};
    _declspec(align(8))short qpp[] = {halfqp,halfqp,halfqp,halfqp};
    _declspec(align(8))short mincoeff[] = {-127,-127,-127,-127};
    _declspec(align(8))short maxcoeff[] = {127,127,127,127};
    _declspec(align(8))short mask[] = {1,1,1,1};
    _declspec(align(8))short tmp[64],*ptmp = tmp;
    unsigned reg_mxcrold,reg_mxcrnew;           // saved / modified MXCSR images

    _asm
    {
        push        esi
        push        edi

        ; Set the MXCSR rounding-control field (bits 13-14) to 01b so that the
        ; float to integer conversions below round toward minus infinity.
        stmxcsr     reg_mxcrold
        mov         eax, reg_mxcrold
        and         eax, 0ffff9fffh
        or          eax, 02000h
        mov         reg_mxcrnew, eax
        ldmxcsr     reg_mxcrnew

        mov         esi, curr_blk
        mov         edi, ptmp
        mov         ecx, 16                     ; 16 groups of 4 coefficients
        pxor        mm7, mm7                    ; mm7  = 0
        movq        mm3, mincoeff               ; mm3  = lower clamp
        movq        mm4, maxcoeff               ; mm4  = upper clamp
        movaps      xmm7, QPP2                  ; xmm7 = 0.5 / QP in all four lanes

    quant_group:
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm6, mm0                    ; mm6 keeps the signed input
        psraw       mm0, 15                     ; mm0 = sign mask (0 or -1)
        pxor        mm1, mm0
        psubsw      mm1, mm0                    ; mm1 = abs(input), saturating
        psubusw     mm1, qpp                    ; dead zone: subtract QP/2, floor at 0

        movq        mm0, mm1
        punpcklwd   mm0, mm7                    ; widen lanes 0,1 to dwords
        punpckhwd   mm1, mm7                    ; widen lanes 2,3 to dwords
        cvtpi2ps    xmm0, mm0
        cvtpi2ps    xmm1, mm1
        shufps      xmm0, xmm1, 044h            ; xmm0 = four magnitudes as floats
        mulps       xmm0, xmm7                  ; scale by 0.5 / QP
        cvtps2pi    mm0, xmm0                   ; lanes 0,1 back to integers
        shufps      xmm0, xmm0, 01eh            ; bring lanes 2,3 down to 0,1
        cvtps2pi    mm1, xmm0
        packssdw    mm0, mm1                    ; mm0 = unsigned level, 4 words

        movq        mm1, mm7
        pcmpgtw     mm1, mm6                    ; -1 where input < 0
        pcmpgtw     mm6, mm7                    ; all ones where input > 0
        pand        mm6, mask                   ; +1 where input > 0
        por         mm1, mm6                    ; mm1 = -1, 0 or +1 per lane
        pmullw      mm0, mm1                    ; apply the sign

        movq        mm1, mm3
        pcmpgtw     mm1, mm0                    ; lanes below the minimum
        movq        mm2, mm1
        pand        mm1, mm3
        pandn       mm2, mm0
        por         mm1, mm2                    ; mm1 = max(level, -127)

        movq        mm2, mm1
        pcmpgtw     mm2, mm4                    ; lanes above the maximum
        movq        mm0, mm2
        pand        mm2, mm4
        pandn       mm0, mm1
        pxor        mm0, mm2                    ; mm0 = min(mm1, 127)

        movq        [edi], mm0
        add         esi, 8
        add         edi, 8
        dec         ecx
        jnz         quant_group

        ldmxcsr     reg_mxcrold                 ; restore the caller rounding mode
        pop         edi
        pop         esi
        emms
    }

    for (int i = 0; i < 64; i++)
    {
        qun_blk[MixZig[i]] = tmp[i];
        if(tmp[i]) nonezero = 1;
    }
    return nonezero;
}
/*
 * DeQuant_blk_P_mmx - inverse-quantize one 8x8 inter block.
 *
 * Levels are first gathered as tmp[i] = curr_blk[MixZig[i]].  For each level
 * L the MMX loop then writes to recn_blk[i]:
 *     0                                             when L == 0
 *     sign(L) * (QP * (2*|L| + 1) - (QP even ? 1 : 0))   otherwise
 * All 64 positions, including index 0, go through the same formula.
 *
 *   curr_blk  64 quantized levels
 *   recn_blk  64 reconstructed coefficients
 *   QP        quantizer; stored as four 16-bit lanes for the multiply
 */
void DeQuant_blk_P_mmx (INT16 *curr_blk, INT16 *recn_blk, int QP)
{
    int i;
    const short qp16 = (short)QP;               // explicit cast: int -> short
    _declspec(align(8)) short QPP2[] = {qp16,qp16,qp16,qp16};
    _declspec(align(8)) unsigned mask[] = {0xffffffff,0xffffffff};
    _declspec(align(8)) short inc1[] = {1,1,1,1};
    _declspec(align(8)) INT16 tmp[64],*ptmp = tmp;

    for (i = 0; i < 64; i++) tmp[i] = curr_blk[MixZig[i]];

    _asm
    {
        push        esi
        push        edi
        mov         esi, ptmp
        mov         edi, recn_blk
        mov         ecx, 16                     ; 16 groups of 4 levels
        pxor        mm7, mm7                    ; mm7 = 0
        movq        mm6, QPP2                   ; mm6 = QP in every lane

    dequant_group:
        movq        mm0, [esi]
        movq        mm1, mm0
        movq        mm5, mm0                    ; mm5 keeps the signed level
        psraw       mm0, 15
        pxor        mm1, mm0
        psubsw      mm1, mm0                    ; mm1 = abs(level)
        psllw       mm1, 1                      ; 2*abs(level)
        paddusw     mm1, inc1                   ; 2*abs(level) + 1
        pmullw      mm1, mm6
        movq        mm0, mm1                    ; mm0 = QP * (2*abs(level) + 1)

        movq        mm1, mm6
        pand        mm1, inc1                   ; QP & 1
        pcmpeqw     mm1, inc1                   ; all ones when QP is odd
        pandn       mm1, mask                   ; -1 when QP is even, else 0
        paddsw      mm0, mm1                    ; subtract 1 for even QP

        movq        mm1, mm0
        movq        mm2, mm7
        pcmpgtw     mm2, mm5                    ; -1 where level < 0
        pmullw      mm1, mm2                    ; negated magnitude in negative lanes
        pandn       mm2, mm0                    ; magnitude in non-negative lanes
        por         mm1, mm2                    ; mm1 = signed reconstruction

        pcmpeqw     mm5, mm7                    ; is the level zero?
        pandn       mm5, mask                   ; all ones where level != 0
        pand        mm1, mm5                    ; force zero levels to zero

        movq        [edi], mm1
        add         esi, 8
        add         edi, 8
        dec         ecx                         ; dec/jnz, matching the quantizer loops
        jnz         dequant_group

        pop         edi
        pop         esi
        emms
    }
}
void block_copy1_mmx( unsigned char *des_block, unsigned char *src_block, int lx_des)
{
unsigned char *tmp = src_block;
int i, j, k;
__asm
{
mov ecx, 8
mov esi, dword ptr [src_block]
mov edi, dword ptr [des_block]
loop001:movq mm0, qword ptr [esi]
movq qword ptr [edi], mm0
add esi, 8
add edi, dword ptr [lx_des]
dec ecx
jnz loop001
emms
}
}
void block_copy2_mmx(INT16 *des_block, unsigned char *src_block, int lx_src)
{
__asm
{
mov ecx, 8
mov esi, [src_block]
mov edi, [des_block]
pxor mm0, mm0
loop002:movq mm1, qword ptr [esi]
movq mm2, mm1
punpcklbw mm1, mm0
punpckhbw mm2, mm0
movq qword ptr [edi], mm1
movq qword ptr [edi+8], mm2
add esi, lx_src
add edi, 16
dec ecx
jnz loop002
emms
}
}
void block_copy3_mmx(unsigned char *des_block, INT16 *src_block, int lx_des)
{
// block_copy3_c(des_block, src_block, lx_des);
__asm
{
mov esi, [src_block]
mov edi, [des_block]
mov ecx, 8
mov edx, [lx_des]
loop_bc3:
movq mm0, [esi]
movq mm1, [esi+8]
packuswb mm0, mm1
movq [edi], mm0
add esi, 16
add edi, edx
dec ecx
jnz loop_bc3
emms
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -