// mot_est_sse.cpp
// (code-viewer page residue: "font size" control label)
#include "mot_est.h"
//! Half-pel interpolation routine defined later in this file; forward-declared here because interpolate_lum_sse calls it.
void InterpolateImage_sse(unsigned char *ipol_image, unsigned char *image, int width, int height, int RTYPE);
//! Picks the source luma plane and the destination interpolation buffer for
//! the current picture and hands them to InterpolateImage_sse (RTYPE = 0).
//!
//! - Picture size is pels x lines; when mv_outside_frame is set both
//!   dimensions grow by 32 and the pointers are moved back by 16 rows + 16
//!   bytes (source) and width*64 + 32 bytes (destination).
//!   Presumably this covers a 16-pixel border around the picture - confirm
//!   against the buffer allocation.
//! - For B_IMG pictures the source is frame_buf[zero_index] and the
//!   destination is next_ipol; otherwise frame_buf[ref_index] and prev_ipol.
//!
//! NOTE(review): in the B_IMG case the border offsets are applied even when
//! mv_outside_frame is clear - confirm B pictures are only coded with
//! mv_outside_frame set.
void interpolate_lum_sse(H263VencStatus *encoder)
{
    int width  = encoder->pels;
    int height = encoder->lines;
    unsigned char *src;
    unsigned char *dst;

    if (encoder->mv_outside_frame)
    {
        width  = encoder->pels + 32;
        height = encoder->lines + 32;
    }

    if (encoder->PTYPE == B_IMG)
    {
        // B pictures: offsets are applied unconditionally.
        src = encoder->frame_buf[encoder->zero_index].pLum - width*16 - 16;
        dst = encoder->next_ipol - width*64 - 32;
    }
    else if (encoder->mv_outside_frame)
    {
        src = (encoder->frame_buf[encoder->ref_index]).pLum - width*16 - 16;
        dst = encoder->prev_ipol - width*64 - 32;
    }
    else
    {
        src = (encoder->frame_buf[encoder->ref_index]).pLum;
        dst = encoder->prev_ipol;
    }

    InterpolateImage_sse(dst, src, width, height, 0);
}
/*
 * InterpolateImage_sse
 *
 * Writes a 2x horizontally and vertically interpolated copy of `image` into
 * `ipol_image`, using MMX registers and PAVGB (per-byte rounded average).
 *
 *   ipol_image : destination; output rows are 2*width bytes apart and
 *                2*height rows are written.
 *   image      : source; rows are width bytes apart, height rows are read.
 *   width      : source row length in bytes; each row is processed in
 *                width/8 groups of 8 source bytes.
 *   height     : number of source rows.
 *   RTYPE      : not referenced anywhere in this body.
 *
 * For a source byte A with right neighbour B, lower neighbour C and
 * lower-right neighbour D, the output receives
 *   even output row:  A,         avg(A,B)
 *   odd output row :  avg(A,C),  avg(avg(B,D), avg(A,C))
 * The last source row has no row below it, so its even output row is also
 * stored as the odd output row. Finally the last byte of every output row is
 * overwritten with the byte before it, because that sample was averaged with
 * the byte that follows the end of the source row.
 *
 * NOTE(review): both asm loops decrement i before testing it, so width/8 must
 * be at least 1; height must be at least 1 for the while (j--) loop to
 * terminate as intended - confirm callers guarantee both.
 * NOTE(review): the loads at [esi+1] / [eax+1] for the last group of a row
 * read one byte beyond that row; for the last source row that byte lies past
 * image + width*height unless the caller's buffer is padded - confirm.
 */
void InterpolateImage_sse(unsigned char *ipol_image, unsigned char *image, int width, int height, int RTYPE)
{
int i,j;
// pucC / tmppucC: current and next source row.
unsigned char *pucC,*tmppucC;
// pucR / tmppucR: the two output rows produced from the current source row.
unsigned char *pucR,*tmppucR;
pucC = image;
pucR = ipol_image;
tmppucC=image+width;
tmppucR=ipol_image+2*width;
// All source rows except the last one.
j = height - 1;
while (j--)
{
i= width / 8;
__asm
{
mov esi,pucC
mov eax,tmppucC
mov edi,pucR
mov edx,tmppucR
loop1:
movq mm0,[esi] // A: 8 bytes of the current row
movq mm1,[esi+1] // B: the same row shifted by one byte
movq mm2,[eax] // C: 8 bytes of the next row
movq mm3,[eax+1] // D: next row shifted by one byte
movq mm4,mm0
movq mm5,mm1
pavgb mm1,mm0 // avg(A,B)
movq mm7,mm0
// Interleave A with avg(A,B) into 16 output bytes of the even row.
punpcklbw mm0,mm1
punpckhbw mm7,mm1
movq [edi],mm0
movq [edi+8],mm7
pavgb mm4,mm2 // avg(A,C)
pavgb mm3,mm5 // avg(D,B)
pavgb mm3,mm4 // avg(avg(D,B), avg(A,C))
movq mm7,mm4
// Interleave avg(A,C) with the centre sample into 16 bytes of the odd row.
punpcklbw mm4,mm3
punpckhbw mm7,mm3
movq [edx],mm4
movq [edx+8],mm7
add esi,8
add eax,8
add edi,16
add edx,16
dec i // decrements the C variable i in place
jnz loop1
emms
}
// One source row forward; two output rows (2 * 2*width bytes) forward.
pucC = pucC + width;
tmppucC = tmppucC + width;
pucR = pucR + 4 * width;
tmppucR = tmppucR + 4 * width;
}
// Last source row: horizontal interpolation only, stored to both output rows.
i = width / 8;
__asm
{
mov esi,pucC
mov edi,pucR
mov edx,tmppucR
loop2:
movq mm0,[esi] // A
movq mm1,[esi+1] // B
pavgb mm1,mm0 // avg(A,B)
movq mm2,mm0
punpcklbw mm0,mm1
punpckhbw mm2,mm1
movq [edi],mm0
movq [edi+8],mm2
movq [edx],mm0
movq [edx+8],mm2
add edi,16
add edx,16
add esi,8
dec i
jnz loop2
emms
}
// For every output row, copy the second-to-last byte into the last byte.
i = 2 * height;
while(i--)
{
*(ipol_image + (i + 1) * (2 * width) - 1) = *(ipol_image + (i + 1) * (2 * width) - 2);
}
return;
}
/*
 * me_sad_a_sse
 *
 * Sum of absolute differences (SAD) between a block in P1 starting at
 * (x1, y1) with row stride lx1 and a block in P2 starting at (x2, y2) with
 * row stride lx2, computed with PSADBW.
 *
 *   blocksize : 16 selects 16 rows of 16 bytes; any other value takes the
 *               8 rows of 8 bytes path.
 *   sad_last  : early-termination limit; accumulation stops after the first
 *               row at which the running sum is >= sad_last.
 *
 * Returns the SAD, or 99999 when the (possibly partial) sum is >= sad_last.
 */
int me_sad_a_sse(int lx1, int lx2, int x1, int y1, int x2, int y2,
unsigned char *P1, unsigned char *P2,
int sad_last, int blocksize)
{
int sad = 0;
// Top-left byte of each block.
unsigned char *p1 = P1 + lx1*y1 + x1;
unsigned char *p2 = P2 + lx2*y2 + x2;
if (16 == blocksize)
{
__asm
{
mov esi, p1
mov edi, p2
mov ecx, 16 // row counter
mov edx, lx1
mov ebx, lx2
mov eax, 0 // running SAD
dist1__l1:
movq mm0, [esi] ;// load esi[0..7] into mm0
movq mm1, [edi] ;// load edi[0..7] into mm1
movq mm2, [esi+8] ;// load esi[8..15] into mm2
movq mm3, [edi+8] ;// load edi[8..15] into mm3
psadbw mm0, mm1; // SAD of the left 8 bytes
psadbw mm2, mm3; // SAD of the right 8 bytes
paddw mm0, mm2; // SAD of the whole 16-byte row
// ebx holds lx2, so it is saved while it is borrowed as scratch.
push ebx
movd ebx, mm0
add eax, ebx
pop ebx
cmp eax, sad_last ;// compare running SAD with sad_last
jge dist1__l2 ;// stop early once running SAD >= sad_last
add esi, edx ;// esi += edx (lx1)
add edi, ebx ;// edi += ebx (lx2)
dec ecx ;// decrement ecx
jnz dist1__l1 ;// loop while not zero
dist1__l2:
mov [sad], eax ;// sad = eax
emms ;// empty MMX state
}
}
else
{
__asm
{
mov esi, p1
mov edi, p2
mov ecx, 8 // row counter
mov edx, lx1
mov ebx, lx2
mov eax, 0 ; // running SAD
dist2__l1:
movq mm0, [esi] ;// load esi[0..7] into mm0
movq mm1, [edi] ;// load edi[0..7] into mm1
psadbw mm0, mm1 // SAD of the 8-byte row
// ebx holds lx2, so it is saved while it is borrowed as scratch.
push ebx
movd ebx, mm0
add eax, ebx
pop ebx
cmp eax, sad_last ;// compare running SAD with sad_last
jge dist2__l2 ;// stop early once running SAD >= sad_last
add esi, edx ;// esi += edx (lx1)
add edi, ebx ;// edi += ebx (lx2)
dec ecx ;// decrement ecx
jnz dist2__l1 ;// loop while not zero
dist2__l2:
mov [sad], eax ;// sad = eax
emms ;// empty MMX state
}
}
// A sum that reached the limit is reported as the sentinel 99999.
if (sad >= sad_last)
{
sad = 99999;
}
return sad;
}
/*
 * me_sad_b_sse
 *
 * Sum of absolute differences (SAD) between a block in P1 starting at
 * (x1, y1) with row stride lx1 and a sub-sampled block in P2 starting at
 * (x2, y2): P2 is read at every second byte horizontally (mask + PACKUSWB)
 * and advanced by 2*lx2 bytes per row.
 * Presumably P2 is a 2x interpolated (half-pel) picture - confirm against
 * callers.
 *
 *   blocksize : 16 selects 16 rows of 16 samples (32 bytes of P2 per row);
 *               any other value takes the 8 rows of 8 samples path
 *               (16 bytes of P2 per row).
 *   sad_last  : early-termination limit; accumulation stops after the first
 *               row at which the running sum is >= sad_last.
 *
 * Returns the SAD, or 99999 when the (possibly partial) sum is >= sad_last.
 *
 * NOTE(review): the first __asm block (a, b, pavgb) produces a value in mm0
 * that is overwritten at the top of both loops before being read; likewise
 * mm7 is cleared in the 8-row loop but not read there. Both look like
 * leftovers - confirm before removing.
 */
int me_sad_b_sse(int lx1, int lx2, int x1, int y1, int x2, int y2,
unsigned char *P1, unsigned char *P2,
int sad_last, int blocksize)
{
// Keeps the low byte of every 16-bit word (bytes at even offsets in memory).
__int64 mask = 0x00ff00ff00ff00ff;
__int64 a = 0x0303030303030303;
__int64 b = 0x0404040404040404;
int sad = 0;
// Top-left byte of each block.
unsigned char *p1 = P1 + lx1*y1 + x1;
unsigned char *p2 = P2 + lx2*y2 + x2;
__asm
{
movq mm0, [a]
movq mm1, [b]
pavgb mm0, mm1
}
if (16 == blocksize)
{
__asm
{
mov esi, p1
mov edi, p2
mov ecx, 16 // row counter
mov edx, lx1
mov ebx, lx2
shl ebx, 1 // P2 row step = 2 * lx2
mov eax, 0 // running SAD
again1: movq mm6, [mask]
movq mm0, [esi] // P1 bytes 0..7
movq mm2, [esi+8] // P1 bytes 8..15
movq mm1, [edi]
movq mm3, [edi+8]
pand mm1, mm6
pand mm3, mm6
packuswb mm1, mm3 // P2 bytes 0,2,...,14
movq mm3, [edi+16]
movq mm7, [edi+24]
pand mm7, mm6
pand mm3, mm6
packuswb mm3, mm7 // P2 bytes 16,18,...,30
psadbw mm0, mm1
psadbw mm2, mm3
paddd mm0, mm2 // SAD of the whole row
// ebx holds the P2 row step, so it is saved while borrowed as scratch.
push ebx
movd ebx, mm0
add eax, ebx
pop ebx
cmp eax, sad_last // stop early once running SAD >= sad_last
jge finish1
add esi, edx ;// esi += edx (lx1)
add edi, ebx ;// edi += ebx (2 * lx2)
dec ecx ;// decrement ecx
jnz again1 ;// loop while not zero
finish1:
mov [sad], eax ;// sad = eax
emms ;// empty MMX state
}
}
else
{
__asm
{
mov esi, p1
mov edi, p2
mov ecx, 8 // row counter
mov edx, lx1
mov ebx, lx2
shl ebx, 1 // P2 row step = 2 * lx2
mov eax, 0 // running SAD
again2: pxor mm7, mm7
movq mm6, [mask]
movq mm0, [esi] // P1 bytes 0..7
movq mm1, [edi]
movq mm3, [edi+8]
pand mm1, mm6
pand mm3, mm6
packuswb mm1, mm3 // P2 bytes 0,2,...,14
psadbw mm0, mm1
// ebx holds the P2 row step, so it is saved while borrowed as scratch.
push ebx
movd ebx, mm0
add eax, ebx
pop ebx
cmp eax, sad_last // stop early once running SAD >= sad_last
jge finish2
add esi, edx ;// esi += edx (lx1)
add edi, ebx ;// edi += ebx (2 * lx2)
dec ecx ;// decrement ecx
jnz again2 ;// loop while not zero
finish2:
mov [sad], eax ;// sad = eax
emms ;// empty MMX state
}
}
// A sum that reached the limit is reported as the sentinel 99999.
if (sad >= sad_last)
{
sad = 99999;
}
return sad;
}
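/*
 * Code-viewer page residue (not part of the source file) - keyboard shortcuts:
 *   Copy code:            Ctrl + C
 *   Search code:          Ctrl + F
 *   Full-screen mode:     F11
 *   Toggle theme:         Ctrl + Shift + D
 *   Show shortcuts:       ?
 *   Increase font size:   Ctrl + =
 *   Decrease font size:   Ctrl + -
 */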