📄 block_sse.cpp
字号:
{
x_off = pix_x + xint;
y_off = pix_y + yint;
unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh);
_asm
{
mov edi,pred_block
mov eax,src1
mov ebx,src2
pxor mm0,mm0
movq mm7,m0
mov ecx,8
loop3:
movq mm1,[eax]
movq mm2,[ebx]
pavgb mm1,mm2
movq [edi],mm1
add edi,8
add eax,lx
add ebx,lx
loop loop3
}
}
else //! neither xh nor yh is zero
{
x_off = pix_x + xint;
y_off = pix_y + yint;
unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh);
unsigned src3 = (unsigned)(prev + (y_off + yh)*lx + x_off);
unsigned src4 = (unsigned)(prev + (y_off + yh)*lx + x_off +xh);
_asm
{
mov edi,pred_block
mov eax,src1
mov ebx,src2
mov edx,src3
mov esi,src4
pxor mm0,mm0
movq mm7,m1
mov ecx,8
loop4:
movq mm1,[eax]
movq mm2,mm1
punpcklbw mm1,mm0
punpckhbw mm2,mm0
movq mm3,[ebx]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
movq mm3,[edx]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
movq mm3,[esi]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
paddusw mm1,mm7
paddusw mm2,mm7
psrlw mm1,2
psrlw mm2,2
packuswb mm1,mm2
movq [edi],mm1
add edi,8
add eax,lx
add ebx,lx
add edx,lx
add esi,lx
loop loop4
}
}
_asm emms
}
//mmx
//
// Helpers for pred_chrom_bid_mmx.  Each one processes an 8x8 block of bytes:
// source rows are `lx` bytes apart, destination rows are packed (8 bytes apart).
// None of the helpers executes EMMS; the entry point does that once at the end.

// Copies an 8x8 block from `src` (row stride lx) to the packed block `dst`.
static void bid_mmx_copy_8x8(const unsigned char *src, int lx, unsigned char *dst)
{
    _asm
    {
        push    esi
        push    edi
        mov     esi, src
        mov     edi, dst
        mov     ecx, 8                  // 8 rows
    copy_row:
        movq    mm0, [esi]
        movq    [edi], mm0
        add     esi, lx
        add     edi, 8
        dec     ecx
        jnz     copy_row
        pop     edi
        pop     esi
    }
}

// Writes dst = (src1 + src2 + 1) >> 1 for every byte of an 8x8 block.
static void bid_mmx_avg2_8x8(const unsigned char *src1, const unsigned char *src2,
                             int lx, unsigned char *dst)
{
    __int64 round1 = 0x0001000100010001;    // +1 in each 16-bit lane

    _asm
    {
        push    edi
        mov     eax, src1
        mov     ebx, src2
        mov     edi, dst
        pxor    mm7, mm7                // mm7 = 0, used to widen bytes to words
        movq    mm0, round1
        mov     ecx, 8                  // 8 rows
    avg2_row:
        movq    mm1, [eax]
        movq    mm3, [ebx]
        movq    mm2, mm1
        movq    mm4, mm3
        punpcklbw mm1, mm7              // mm1/mm2 = src1 low/high 4 pixels as words
        punpckhbw mm2, mm7
        punpcklbw mm3, mm7              // mm3/mm4 = src2 low/high 4 pixels as words
        punpckhbw mm4, mm7
        paddusw mm1, mm3
        paddusw mm1, mm0
        paddusw mm2, mm4
        paddusw mm2, mm0
        psrlw   mm1, 1
        psrlw   mm2, 1
        packuswb mm1, mm2
        movq    [edi], mm1
        add     eax, lx
        add     ebx, lx
        add     edi, 8
        dec     ecx
        jnz     avg2_row
        pop     edi
    }
}

// Writes dst = (src1 + src2 + src3 + src4 + 2) >> 2 for every byte of an 8x8 block.
static void bid_mmx_avg4_8x8(const unsigned char *src1, const unsigned char *src2,
                             const unsigned char *src3, const unsigned char *src4,
                             int lx, unsigned char *dst)
{
    __int64 round2 = 0x0002000200020002;    // +2 in each 16-bit lane

    _asm
    {
        push    esi
        push    edi
        mov     edi, dst
        mov     eax, src1
        mov     ebx, src2
        mov     edx, src3
        mov     esi, src4
        pxor    mm0, mm0                // mm0 = 0, used to widen bytes to words
        movq    mm7, round2
        mov     ecx, 8                  // 8 rows
    avg4_row:
        movq    mm1, [eax]              // mm1/mm2 accumulate the low/high word sums
        movq    mm2, mm1
        punpcklbw mm1, mm0
        punpckhbw mm2, mm0
        movq    mm3, [ebx]
        movq    mm4, mm3
        punpcklbw mm3, mm0
        punpckhbw mm4, mm0
        paddusw mm1, mm3
        paddusw mm2, mm4
        movq    mm3, [edx]
        movq    mm4, mm3
        punpcklbw mm3, mm0
        punpckhbw mm4, mm0
        paddusw mm1, mm3
        paddusw mm2, mm4
        movq    mm3, [esi]
        movq    mm4, mm3
        punpcklbw mm3, mm0
        punpckhbw mm4, mm0
        paddusw mm1, mm3
        paddusw mm2, mm4
        paddusw mm1, mm7
        paddusw mm2, mm7
        psrlw   mm1, 2
        psrlw   mm2, 2
        packuswb mm1, mm2
        movq    [edi], mm1
        add     edi, 8
        add     eax, lx
        add     ebx, lx
        add     edx, lx
        add     esi, lx
        dec     ecx
        jnz     avg4_row
        pop     edi
        pop     esi
    }
}

// Updates dst = (dst >> 1) + (src >> 1) for every byte of an 8x8 block.
// Each half is truncated separately before the add.
static void bid_mmx_blend_8x8(const unsigned char *src, int lx, unsigned char *dst)
{
    _asm
    {
        push    esi
        push    edi
        mov     esi, src
        mov     edi, dst
        pxor    mm7, mm7                // mm7 = 0, used to widen bytes to words
        mov     ecx, 8                  // 8 rows
    blend_row:
        movq    mm1, [esi]
        movq    mm3, [edi]
        movq    mm2, mm1
        punpcklbw mm1, mm7
        punpckhbw mm2, mm7
        psrlw   mm1, 1                  // src / 2
        psrlw   mm2, 1
        movq    mm4, mm3
        punpcklbw mm3, mm7
        punpckhbw mm4, mm7
        psrlw   mm3, 1                  // dst / 2
        psrlw   mm4, 1
        paddusw mm1, mm3
        paddusw mm2, mm4
        packuswb mm1, mm2
        movq    [edi], mm1
        add     esi, lx
        add     edi, 8
        dec     ecx
        jnz     blend_row
        pop     edi
        pop     esi
    }
}

// Updates dst = (dst >> 1) + ((src1 + src2 + 1) >> 2) for every byte of an 8x8 block.
static void bid_mmx_blend_avg2_8x8(const unsigned char *src1, const unsigned char *src2,
                                   int lx, unsigned char *dst)
{
    __int64 round1 = 0x0001000100010001;    // +1 in each 16-bit lane

    _asm
    {
        push    edi
        mov     eax, src1
        mov     ebx, src2
        mov     edi, dst
        pxor    mm7, mm7                // mm7 = 0, used to widen bytes to words
        movq    mm0, round1
        mov     ecx, 8                  // 8 rows
    blend2_row:
        movq    mm1, [edi]
        movq    mm3, [eax]
        movq    mm5, [ebx]
        movq    mm2, mm1
        punpcklbw mm1, mm7
        punpckhbw mm2, mm7
        psrlw   mm1, 1                  // dst / 2
        psrlw   mm2, 1
        movq    mm4, mm3
        punpcklbw mm3, mm7
        punpckhbw mm4, mm7
        movq    mm6, mm5
        punpcklbw mm5, mm7
        punpckhbw mm6, mm7
        paddusw mm3, mm5
        paddusw mm4, mm6
        paddusw mm3, mm0
        paddusw mm4, mm0
        psrlw   mm3, 2                  // (src1 + src2 + 1) / 4
        psrlw   mm4, 2
        paddusw mm1, mm3
        paddusw mm2, mm4
        packuswb mm1, mm2
        movq    [edi], mm1
        add     eax, lx
        add     ebx, lx
        add     edi, 8
        dec     ecx
        jnz     blend2_row
        pop     edi
    }
}

// Updates dst = (dst >> 1) + ((src1 + src2 + src3 + src4 + 2) >> 3)
// for every byte of an 8x8 block.
static void bid_mmx_blend_avg4_8x8(const unsigned char *src1, const unsigned char *src2,
                                   const unsigned char *src3, const unsigned char *src4,
                                   int lx, unsigned char *dst)
{
    __int64 round2 = 0x0002000200020002;    // +2 in each 16-bit lane

    _asm
    {
        push    esi
        push    edi
        mov     esi, src1
        mov     eax, src2
        mov     ebx, src3
        mov     edx, src4
        mov     edi, dst
        pxor    mm7, mm7                // mm7 = 0, used to widen bytes to words
        movq    mm0, round2
        mov     ecx, 8                  // 8 rows
    blend4_row:
        movq    mm1, [edi]
        movq    mm2, mm1
        punpcklbw mm1, mm7
        punpckhbw mm2, mm7
        psrlw   mm1, 1                  // dst / 2
        psrlw   mm2, 1
        movq    mm3, [esi]              // mm3/mm4 accumulate the low/high word sums
        movq    mm4, mm3
        punpcklbw mm3, mm7
        punpckhbw mm4, mm7
        movq    mm5, [eax]
        movq    mm6, mm5
        punpcklbw mm5, mm7
        punpckhbw mm6, mm7
        paddusw mm3, mm5
        paddusw mm4, mm6
        movq    mm5, [ebx]
        movq    mm6, mm5
        punpcklbw mm5, mm7
        punpckhbw mm6, mm7
        paddusw mm3, mm5
        paddusw mm4, mm6
        movq    mm5, [edx]
        movq    mm6, mm5
        punpcklbw mm5, mm7
        punpckhbw mm6, mm7
        paddusw mm3, mm5
        paddusw mm4, mm6
        paddusw mm3, mm0
        paddusw mm4, mm0
        psrlw   mm3, 3                  // (src1 + src2 + src3 + src4 + 2) / 8
        psrlw   mm4, 3
        paddusw mm1, mm3
        paddusw mm2, mm4
        packuswb mm1, mm2
        movq    [edi], mm1
        add     edi, 8
        add     esi, lx
        add     eax, lx
        add     ebx, lx
        add     edx, lx
        dec     ecx
        jnz     blend4_row
        pop     edi
        pop     esi
    }
}

/*
 * Bidirectional 8x8 prediction (MMX).
 *
 * First builds a prediction from `prev` using the vector (dxf, dyf), then
 * blends into it a prediction from `next` using the vector (dxb, dyb).
 *
 *   pix_x, pix_y  position of the block's top-left sample
 *   prev, next    the two reference planes, both with row stride lx
 *   lx            row stride of prev/next in bytes
 *   dxf, dyf      vector into prev; bit 0 selects half-sample interpolation,
 *                 the remaining bits (>> 1) are the whole-sample offset
 *   dxb, dyb      vector into next, same encoding
 *   pred_block    output, 64 bytes, rows packed 8 bytes apart
 *
 * Per byte, with P the prediction from prev and N the one from next:
 *   P = a | (a+b+1)>>1 | (a+b+c+d+2)>>2            (0, 1 or 2 half-sample axes)
 *   out = (P>>1) + ( n>>1 | (a+b+1)>>2 | (a+b+c+d+2)>>3 )
 *
 * Each row is read with an 8-byte load, and the half-sample cases also read
 * the neighbouring column and/or row, so those bytes must be readable.
 * MMX state is cleared with EMMS before returning.
 */
void pred_chrom_bid_mmx(int pix_x, int pix_y, unsigned char *prev, unsigned char *next, int lx,
                        int dxf, int dyf, int dxb, int dyb, unsigned char *pred_block)
{
    // ---- prediction from prev -------------------------------------------
    int xh = dxf & 1;
    int yh = dyf & 1;
    unsigned char *src = prev + (pix_y + (dyf >> 1)) * lx + (pix_x + (dxf >> 1));

    if (!xh && !yh)
    {
        bid_mmx_copy_8x8(src, lx, pred_block);
    }
    else if (xh && yh)
    {
        bid_mmx_avg4_8x8(src, src + xh, src + yh * lx, src + yh * lx + xh, lx, pred_block);
    }
    else
    {
        // Exactly one of xh/yh is set: the second tap is one column to the
        // right or one row down.
        bid_mmx_avg2_8x8(src, src + yh * lx + xh, lx, pred_block);
    }

    // ---- blend in the prediction from next ------------------------------
    xh = dxb & 1;
    yh = dyb & 1;
    src = next + (pix_y + (dyb >> 1)) * lx + (pix_x + (dxb >> 1));

    if (!xh && !yh)
    {
        bid_mmx_blend_8x8(src, lx, pred_block);
    }
    else if (xh && yh)
    {
        bid_mmx_blend_avg4_8x8(src, src + xh, src + yh * lx, src + yh * lx + xh, lx, pred_block);
    }
    else
    {
        bid_mmx_blend_avg2_8x8(src, src + yh * lx + xh, lx, pred_block);
    }

    _asm emms
}
//sse
void pred_chrom_bid_sse(int pix_x, int pix_y, unsigned char *prev, unsigned char *next, int lx,
int dxf, int dyf, int dxb, int dyb, unsigned char *pred_block)
{
int xint1, yint1, xint2, yint2;
int xh1, yh1, xh2, yh2;
int i, j;
int x_off, y_off;
unsigned char *tmp = pred_block;
__int64 m1 = 0x0001000100010001;
__int64 m2 = 0x0002000200020002;
xint1 = dxf>>1;
xh1 = dxf & 1;
yint1 = dyf>>1;
yh1 = dyf & 1;
if (!xh1 && !yh1) //!< xh and yh are both zero
{
x_off = pix_x + xint1;
y_off = pix_y + yint1;
unsigned char *src = prev + y_off*lx + x_off;
_asm
{
push esi
push edi
mov esi,src
mov edi,pred_block
mov ecx,8
loop1:
movq mm0,[esi]
movq [edi],mm0
add esi,lx
add edi,8
loop loop1
pop edi
pop esi
}
}
else if (!xh1 && yh1) //!< yh is not zero
{
x_off = pix_x + xint1;
y_off = pix_y + yint1;
/* for (i = 0; i < 8; i++)
{
for (j = 0; j < 8; j++)
{
*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
*(prev + (y_off+i+yh1)*lx + x_off+j) + 1)>>1;
tmp++;
}
}
*/
unsigned char *src1 = prev + y_off*lx + x_off;
unsigned char *src2 = prev + (y_off+yh1)*lx + x_off;
_asm
{
push esi
mov eax,src1
mov ebx,src2
mov edi,pred_block
pxor mm7,mm7
movq mm0,m1
mov ecx,8
loop2:
movq mm1,[eax]
movq mm3,[ebx]
pavgb mm1,mm3
movq [edi],mm1
add eax,lx
add ebx,lx
add edi,8
loop loop2
pop edi
}
}
else if (xh1 && !yh1) //!< xh is not zero
{
x_off = pix_x + xint1;
y_off = pix_y + yint1;
/* for (i = 0; i < 8; i++)
{
for (j = 0; j < 8; j++)
{
*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
*(prev + (y_off+i)*lx + x_off+j+xh1) + 1)>>1;
tmp++;
}
}
*/
unsigned char *src1 = prev + y_off*lx + x_off;
unsigned char *src2 = prev + y_off*lx + x_off + xh1;
_asm
{
push esi
mov eax,src1
mov ebx,src2
mov edi,pred_block
pxor mm7,mm7
movq mm0,m1
mov ecx,8
loop3:
movq mm1,[eax]
movq mm3,[ebx]
pavgb mm1,mm3
movq [edi],mm1
add eax,lx
add ebx,lx
add edi,8
loop loop3
pop edi
}
}
else //! neither xh nor yh is zero
{
x_off = pix_x + xint1;
y_off = pix_y + yint1;
/* for (i = 0; i < 8; i++)
{
for (j = 0; j < 8; j++)
{
*tmp = (*(prev + (y_off+i)*lx + x_off+j) +
*(prev + (y_off+i)*lx + x_off+j+xh1) +
*(prev + (y_off+i+yh1)*lx + x_off+j) +
*(prev + (y_off+i+yh1)*lx + x_off+j+xh1) +
2)>>2;
tmp++;
}
}
*/
unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh1);
unsigned src3 = (unsigned)(prev + (y_off + yh1)*lx + x_off);
unsigned src4 = (unsigned)(prev + (y_off + yh1)*lx + x_off +xh1);
_asm
{
push esi
push edi
mov edi,pred_block
mov eax,src1
mov ebx,src2
mov edx,src3
mov esi,src4
pxor mm0,mm0
movq mm7,m2
mov ecx,8
loop4:
movq mm1,[eax]
movq mm2,mm1
punpcklbw mm1,mm0
punpckhbw mm2,mm0
movq mm3,[ebx]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -