📄 block_sse.cpp
字号:
#include "block.h"
#include <stdio.h>
#include <stdlib.h>
static short Mc_sse[8][8] =
{
{4,5,5,5,5,5,5,4},
{5,5,5,5,5,5,5,5},
{5,5,6,6,6,6,5,5},
{5,5,6,6,6,6,5,5},
{5,5,6,6,6,6,5,5},
{5,5,6,6,6,6,5,5},
{5,5,5,5,5,5,5,5},
{4,5,5,5,5,5,5,4},
};
static short Mt_sse[8][8] =
{
{2,2,2,2,2,2,2,2},
{1,1,2,2,2,2,1,1},
{1,1,1,1,1,1,1,1},
{1,1,1,1,1,1,1,1},
{0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0},
};
static short Mb_sse[8][8] =
{
{0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0},
{0,0,0,0,0,0,0,0},
{1,1,1,1,1,1,1,1},
{1,1,1,1,1,1,1,1},
{1,1,2,2,2,2,1,1},
{2,2,2,2,2,2,2,2},
};
static short Mr_sse[8][8] =
{
{0,0,0,0,1,1,1,2},
{0,0,0,0,1,1,2,2},
{0,0,0,0,1,1,2,2},
{0,0,0,0,1,1,2,2},
{0,0,0,0,1,1,2,2},
{0,0,0,0,1,1,2,2},
{0,0,0,0,1,1,2,2},
{0,0,0,0,1,1,1,2},
};
static short Ml_sse[8][8] =
{
{2,1,1,1,0,0,0,0},
{2,2,1,1,0,0,0,0},
{2,2,1,1,0,0,0,0},
{2,2,1,1,0,0,0,0},
{2,2,1,1,0,0,0,0},
{2,2,1,1,0,0,0,0},
{2,2,1,1,0,0,0,0},
{2,1,1,1,0,0,0,0},
};
void pred_lum_sse(int pix_x, int pix_y, MotionVector *MV, unsigned char *ipol,
int lx, unsigned char *pred_block)
{
int pred_x, pred_y;
int h_space = 2;
int v_space = lx * 2;
unsigned char *tmp1;
unsigned char *tmp2 = pred_block;
__int64 mask = 0x00ff00ff00ff00ff;
pred_x = 2*pix_x + 2*MV->x + MV->x_half;
pred_y = 2*pix_y + 2*MV->y + MV->y_half;
tmp1 = ipol + lx*pred_y + pred_x;
__asm
{
mov esi, [tmp1]
mov edi, [tmp2]
mov ecx, 8
movq mm0, [mask]
looppl: movq mm1, [esi]
movq mm2, [esi+8]
pand mm1, mm0
pand mm2, mm0
packuswb mm1, mm2
movq [edi], mm1
add esi, [v_space]
add edi, 8
dec ecx
jnz looppl
emms
}
return;
}
void pred_lum_bid_sse(int pix_x, int pix_y, int dxf, int dyf, int dxb, int dyb,
unsigned char *prev_ipol, unsigned char *next_ipol, int lx,
unsigned char *pred_block)
{
int fw_pred_x, fw_pred_y;
int bw_pred_x, bw_pred_y;
int i, j, k, l;
// int h_space = 2;
int v_space = lx * 2;
unsigned char *tmp1;
unsigned char *tmp2;
unsigned char *tmp3 = pred_block;
__int64 mask = 0x00ff00ff00ff00ff;
fw_pred_x = 2*pix_x + dxf;
fw_pred_y = 2*pix_y + dyf;
bw_pred_x = 2*pix_x + dxb;
bw_pred_y = 2*pix_y + dyb;
tmp1 = prev_ipol + lx*fw_pred_y + fw_pred_x;
tmp2 = next_ipol + lx*bw_pred_y + bw_pred_x;
__asm
{
mov esi, tmp1
mov edi, tmp2
mov ebx, tmp3
mov edx, [v_space]
mov ecx, 8
movq mm7, [mask]
PRED_LUM_BID_SSE_LOOP:
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [edi]
movq mm3, [edi+8]
pand mm0, mm7
pand mm1, mm7
pand mm2, mm7
pand mm3, mm7
packuswb mm0, mm1
packuswb mm2, mm3
pavgb mm0, mm2
movq [ebx], mm0
add esi, edx
add edi, edx
add ebx, 8
dec ecx
jne PRED_LUM_BID_SSE_LOOP
emms
}
return;
}
void pred_obmc_sse(int x, int y, MotionVector *MV[6][MBR+1][MBC+2], int pic_width,
unsigned char *prev_ipol, int lx, int block, unsigned char *pred_block)
{
int r = y/16+1;
int c = x/16+1;
int use4mv_c = (MV[0][r][c]->Mode == MODE_INTER4V ? 1 : 0);
int use4mv_t = (MV[0][r-1][c]->Mode == MODE_INTER4V ? 1 : 0);
int use4mv_l = (MV[0][r][c-1]->Mode == MODE_INTER4V ? 1 : 0);
int use4mv_r = (MV[0][r][c+1]->Mode == MODE_INTER4V ? 1 : 0);
int intra_t, intra_l, intra_r;
int vecc, vect, vecb, vecl, vecr;
int x_curr, y_curr, x_top, y_top, x_btm, y_btm, x_left, y_left, x_right, y_right;
int pred_x_c, pred_x_t, pred_x_b, pred_x_l, pred_x_r;
int pred_y_c, pred_y_t, pred_y_b, pred_y_l, pred_y_r;
int val_c, val_t, val_b, val_l, val_r;
int m, n;
int tmpx, tmpy; //zj added , 2003.3.5
unsigned char *ppred_i_c, *ppred_i_t, *ppred_i_l, *ppred_i_r, *ppred_i_b;
short *ptmpMc=&Mc_sse[0][0], *ptmpMt=&Mt_sse[0][0], *ptmpMl=&Ml_sse[0][0],
*ptmpMr=&Mr_sse[0][0], *ptmpMb=&Mb_sse[0][0];
int looptimes=8;
__int64 mask=0x00ff00ff00ff00ff, roudnb=0x0004000400040004;
short tmp_block[8][8];
short *red_block = &tmp_block[0][0];
intra_t = ((MV[0][r-1][c]->Mode == MODE_INTRA || MV[0][r-1][c]->Mode == MODE_INTRA_Q) ? 1 : 0);
// intra_t = (MV[0][r-1][c]->Mode == MODE_INTRA_Q ? 1 : intra_t);
intra_l = ((MV[0][r][c-1]->Mode == MODE_INTRA || MV[0][r][c-1]->Mode == MODE_INTRA_Q) ? 1 : 0);
// intra_l = (MV[0][r][c-1]->Mode == MODE_INTRA_Q ? 1 : intra_l);
intra_r = ((MV[0][r][c+1]->Mode == MODE_INTRA || MV[0][r][c+1]->Mode == MODE_INTRA_Q) ? 1 : 0);
// intra_r = (MV[0][r][c+1]->Mode == MODE_INTRA_Q ? 1 : intra_r);
switch (block)
{
case 1:
vecc = use4mv_c ? 1 : 0;
x_curr = c;
y_curr = r;
vect = intra_t ? vecc : (use4mv_t ? 3 : 0);
x_top = c;
y_top = intra_t ? r : r-1;
vecb = use4mv_c ? 3 : 0;
x_btm = c;
y_btm = r;
vecl = intra_l ? vecc: (use4mv_l ? 2 : 0);
x_left = intra_l ? c : c-1;
y_left = r;
vecr = use4mv_c ? 2 : 0;
x_right = c;
y_right = r;
if (1 == c)
{
vecl = vecc;
x_left = 1;
}
if (1 == r)
{
vect = vecc;
y_top = 1;
}
break;
case 2:
vecc = use4mv_c ? 2 : 0;
x_curr = c;
y_curr = r;
vect = intra_t ? vecc : (use4mv_t ? 4 : 0);
x_top = c;
y_top = intra_t ? r : r-1;
vecb = use4mv_c ? 4 : 0;
x_btm = c;
y_btm = r;
vecl = use4mv_c ? 1 : 0;
x_left = c;
y_left = r;
vecr = intra_r ? vecc : (use4mv_r ? 1 : 0);
x_right = intra_r ? c : c+1;
y_right = r;
if (1 == r)
{
vect = vecc;
y_top = 1;
}
if (c == pic_width/16)
{
vecr = vecc;
x_right = c;
}
break;
case 3:
vecc = use4mv_c ? 3 : 0;
x_curr = c;
y_curr = r;
vect = use4mv_c ? 1 : 0;
x_top = c;
y_top = r;
vecb = vecc;
x_btm = c;
y_btm = r;
vecl = intra_l ? vecc : (use4mv_l ? 4 : 0);
x_left = intra_l ? c : c-1;
y_left = r;
vecr = use4mv_c ? 4 : 0;
x_right = c;
y_right = r;
if (1 == c)
{
vecl = vecc;
x_left = 1;
}
break;
case 4:
vecc = use4mv_c ? 4 : 0;
x_curr = c;
y_curr = r;
vect = use4mv_c ? 2 : 0;
x_top = c;
y_top = r;
vecb = vecc;
x_btm = c;
y_btm = r;
vecl = use4mv_c ? 3 : 0;
x_left = c;
y_left = r;
vecr = intra_r ? vecc : (use4mv_r ? 3 : 0);
x_right = intra_r ? c : c+1;
y_right = r;
if (c == pic_width/16)
{
vecr = vecc;
x_right = c;
}
break;
default:
printf("obmc error, block type not exist.\n");
exit(-1);
}
tmpx = 2*x + (((block-1)&1)<<4);
tmpy = 2*y + (((block-1)&2)<<3);
pred_x_c = tmpx + MV[vecc][y_curr][x_curr]->x*2 + MV[vecc][y_curr][x_curr]->x_half;
pred_x_t = tmpx + MV[vect][y_top][x_top]->x*2 + MV[vect][y_top][x_top]->x_half;
pred_x_b = tmpx + MV[vecb][y_btm][x_btm]->x*2 + MV[vecb][y_btm][x_btm]->x_half;
pred_x_l = tmpx + MV[vecl][y_left][x_left]->x*2 + MV[vecl][y_left][x_left]->x_half;
pred_x_r = tmpx + MV[vecr][y_right][x_right]->x*2 + MV[vecr][y_right][x_right]->x_half;
pred_y_c = tmpy + MV[vecc][y_curr][x_curr]->y*2 + MV[vecc][y_curr][x_curr]->y_half;
pred_y_t = tmpy + MV[vect][y_top][x_top]->y*2 + MV[vect][y_top][x_top]->y_half;
pred_y_b = tmpy + MV[vecb][y_btm][x_btm]->y*2 + MV[vecb][y_btm][x_btm]->y_half;
pred_y_l = tmpy + MV[vecl][y_left][x_left]->y*2 + MV[vecl][y_left][x_left]->y_half;
pred_y_r = tmpy + MV[vecr][y_right][x_right]->y*2 + MV[vecr][y_right][x_right]->y_half;
ppred_i_c = prev_ipol + pred_y_c * lx + pred_x_c;
ppred_i_r = prev_ipol + pred_y_r * lx + pred_x_r;
ppred_i_l = prev_ipol + pred_y_l * lx + pred_x_l;
ppred_i_b = prev_ipol + pred_y_b * lx + pred_x_b;
ppred_i_t = prev_ipol + pred_y_t * lx + pred_x_t;
lx *= 2;
__asm
{
movq mm0, [mask]
movq mm6, [roudnb]
PRED_OBMC_SSE_LOOP1:
mov esi,ppred_i_c
mov eax,ppred_i_t
mov ecx,ppred_i_b
mov edx,ppred_i_r
mov edi,ppred_i_l
movq mm1,[esi]
pand mm1,mm0
mov esi,ptmpMc
movq mm2,[esi]
pmullw mm1,mm2
movq mm2,[eax]
pand mm2,mm0
mov eax,ptmpMt
movq mm3,[eax]
pmullw mm2,mm3
paddw mm1,mm2
movq mm3,[ecx]
pand mm3,mm0
mov ecx,ptmpMb
movq mm4,[ecx]
pmullw mm3,mm4
paddw mm1,mm3
movq mm4,[edx]
pand mm4,mm0
mov edx,ptmpMr
movq mm2,[edx]
pmullw mm4,mm2
paddw mm1,mm4
movq mm5,[edi]
pand mm5,mm0
mov edi,ptmpMl
movq mm2,[edi]
pmullw mm5,mm2
paddw mm1,mm5
paddw mm1,mm6
psraw mm1,3
mov esi,pred_block
// movq mm7, mm1
movq [esi],mm1 //do four pixels
// add ppred_i_c,8
// add ppred_i_t,8
// add ppred_i_b,8
// add ppred_i_r,8
// add ppred_i_l,8
add ptmpMc,8
add ptmpMt,8
add ptmpMb,8
add ptmpMr,8
add ptmpMl,8
// add pred,4
mov esi,ppred_i_c
mov eax,ppred_i_t
mov ecx,ppred_i_b
mov edx,ppred_i_r
mov edi,ppred_i_l
movq mm1,[esi+8]
pand mm1,mm0
mov esi,ptmpMc
movq mm2,[esi]
pmullw mm1,mm2
movq mm2,[eax+8]
pand mm2,mm0
mov eax,ptmpMt
movq mm3,[eax]
pmullw mm2,mm3
paddw mm1,mm2
movq mm3,[ecx+8]
pand mm3,mm0
mov ecx,ptmpMb
movq mm4,[ecx]
pmullw mm3,mm4
paddw mm1,mm3
movq mm4,[edx+8]
pand mm4,mm0
mov edx,ptmpMr
movq mm2,[edx]
pmullw mm4,mm2
paddw mm1,mm4
movq mm5,[edi+8]
pand mm5,mm0
mov edi,ptmpMl
movq mm2,[edi]
pmullw mm5,mm2
paddw mm1,mm5
paddw mm1,mm6
psraw mm1,3
mov esi,pred_block
movq mm7, [esi]
packuswb mm7, mm1
movq [esi],mm7 //do four pixels
add pred_block, 8
mov eax, lx
add ppred_i_c,eax
add ppred_i_t,eax
add ppred_i_b,eax
add ppred_i_r,eax
add ppred_i_l,eax
add ptmpMc,8
add ptmpMt,8
add ptmpMb,8
add ptmpMr,8
add ptmpMl,8
dec looptimes
jnz PRED_OBMC_SSE_LOOP1
emms
}
// FILE *fptr;
// fptr = fopen("c:/pred_obmc_mmx.txt", "w");
// for(int i = 0; i<64; i++)
// {
// fprintf (fptr, "%4d ", *(pred_block+1));
// }
// fprintf (fptr, "\n");
// fclose(fptr);
// for (int i = 0; i<8; i++)
// {
// for(int j=0; j<8; j++)
// {
// *(pred_block + i*8 + j) = (unsigned char) *(red_block + i*8 +j);
// }
// }
/* for (n = 0; n < 8; n++)
{
for (m = 0; m < 8; m++)
{
val_c = *(prev_ipol + (pred_y_c + n*2)*lx + pred_x_c+m*2) * Mc[n][m];
val_t = *(prev_ipol + (pred_y_t + n*2)*lx + pred_x_t+m*2) * Mt[n][m];
val_b = *(prev_ipol + (pred_y_b + n*2)*lx + pred_x_b+m*2) * Mb[n][m];
val_l = *(prev_ipol + (pred_y_l + n*2)*lx + pred_x_l+m*2) * Ml[n][m];
val_r = *(prev_ipol + (pred_y_r + n*2)*lx + pred_x_r+m*2) * Mr[n][m];
*(pred_block + n*8 +m) = (val_c + val_t + val_b + val_l + val_r + 4)>>3;
}
}
*/
return;
}
//mmx
void pred_chrom_mmx(int pix_x, int pix_y, unsigned char *prev, int lx,
int dx, int dy, unsigned char *pred_block)
{
int xint, yint;
int xh, yh;
int i, j;
int x_off, y_off;
unsigned char *tmp = pred_block;
unsigned char m0[] = {1,0,1,0,1,0,1,0};
unsigned char m1[] = {2,0,2,0,2,0,2,0};
xint = dx>>1;
xh = dx & 1;
yint = dy>>1;
yh = dy & 1;
if (!xh && !yh) //!< xh and yh are both zero
{
x_off = pix_x + xint;
y_off = pix_y + yint;
_asm
{
mov esi,prev
mov edi,pred_block
mov eax,lx
mov ebx,y_off
mul ebx
add esi,x_off
add esi,eax
mov eax,lx
mov ecx,8
loop1:
movq mm0,[esi]
movq [edi],mm0
add esi,eax
add edi,8
loop loop1
}
}
else if (!xh && yh) //!< yh is not zero
{
x_off = pix_x + xint;
y_off = pix_y + yint;
unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
unsigned src2 = (unsigned)(prev + (y_off+yh) * lx +x_off);
_asm
{
mov edi,pred_block
mov eax,src1
mov ebx,src2
pxor mm0,mm0
movq mm7,m0
mov ecx,8
loop2:
movq mm1,[eax]
movq mm3,[ebx]
movq mm2,mm1
movq mm4,mm3
punpcklbw mm1,mm0
punpckhbw mm2,mm0
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
paddusw mm1,mm7
paddusw mm2,mm7
psrlw mm1,1
psrlw mm2,1
packuswb mm1,mm2
movq [edi],mm1
add edi,8
add eax,lx
add ebx,lx
loop loop2
}
}
else if (xh && !yh) //!< xh is not zero
{
x_off = pix_x + xint;
y_off = pix_y + yint;
unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh);
_asm
{
mov edi,pred_block
mov eax,src1
mov ebx,src2
pxor mm0,mm0
movq mm7,m0
mov ecx,8
loop3:
movq mm1,[eax]
movq mm3,[ebx]
movq mm2,mm1
movq mm4,mm3
punpcklbw mm1,mm0
punpckhbw mm2,mm0
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
paddusw mm1,mm7
paddusw mm2,mm7
psrlw mm1,1
psrlw mm2,1
packuswb mm1,mm2
movq [edi],mm1
add edi,8
add eax,lx
add ebx,lx
loop loop3
}
}
else //! neither xh nor yh is zero
{
x_off = pix_x + xint;
y_off = pix_y + yint;
unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
unsigned src2 = (unsigned)(prev + y_off * lx +x_off + xh);
unsigned src3 = (unsigned)(prev + (y_off + yh)*lx + x_off);
unsigned src4 = (unsigned)(prev + (y_off + yh)*lx + x_off +xh);
_asm
{
mov edi,pred_block
mov eax,src1
mov ebx,src2
mov edx,src3
mov esi,src4
pxor mm0,mm0
movq mm7,m1
mov ecx,8
loop4:
movq mm1,[eax]
movq mm2,mm1
punpcklbw mm1,mm0
punpckhbw mm2,mm0
movq mm3,[ebx]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
movq mm3,[edx]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
movq mm3,[esi]
movq mm4,mm3
punpcklbw mm3,mm0
punpckhbw mm4,mm0
paddusw mm1,mm3
paddusw mm2,mm4
paddusw mm1,mm7
paddusw mm2,mm7
psrlw mm1,2
psrlw mm2,2
packuswb mm1,mm2
movq [edi],mm1
add edi,8
add eax,lx
add ebx,lx
add edx,lx
add esi,lx
loop loop4
}
}
_asm emms
}
//sse
void pred_chrom_sse(int pix_x, int pix_y, unsigned char *prev, int lx,
int dx, int dy, unsigned char *pred_block)
{
int xint, yint;
int xh, yh;
int i, j;
int x_off, y_off;
unsigned char *tmp = pred_block;
unsigned char m0[] = {1,0,1,0,1,0,1,0};
unsigned char m1[] = {2,0,2,0,2,0,2,0};
xint = dx>>1;
xh = dx & 1;
yint = dy>>1;
yh = dy & 1;
if (!xh && !yh) //!< xh and yh are both zero
{
x_off = pix_x + xint;
y_off = pix_y + yint;
_asm
{
mov esi,prev
mov edi,pred_block
mov eax,lx
mov ebx,y_off
mul ebx
add esi,x_off
add esi,eax
mov eax,lx
mov ecx,8
loop1:
movq mm0,[esi]
movq [edi],mm0
add esi,eax
add edi,8
loop loop1
}
}
else if (!xh && yh) //!< yh is not zero
{
x_off = pix_x + xint;
y_off = pix_y + yint;
unsigned src1 = (unsigned)(prev + y_off * lx +x_off);
unsigned src2 = (unsigned)(prev + (y_off+yh) * lx +x_off);
_asm
{
mov edi,pred_block
mov eax,src1
mov ebx,src2
pxor mm0,mm0
movq mm7,m0
mov ecx,8
loop2:
movq mm1,[eax]
movq mm2,[ebx]
pavgb mm1,mm2
movq [edi],mm1
add edi,8
add eax,lx
add ebx,lx
loop loop2
}
}
else if (xh && !yh) //!< xh is not zero
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -