/* block.c */
/* Font size: (control text from the code viewer; not part of the source file) */
// Working storage for the inline-assembly transform/quantisation code in this function.
// ff: 64-bit mask constant; it is not referenced anywhere in the visible part of the function.
static _int64 ff=0xffffffffffff0000;
// imgcw: copy of img->width_cr; also not referenced in the visible part of the function.
int imgcw=img->width_cr;
// mulres: 32-bit products |coefficient| * quant_coef, written by the asm loop and read by the AC quantiser.
// pt4: address of the current sub-block inside mulres.
_int32 mulres[8][8],*pt4;
// pt5: start of the 16-bit quantiser table selected by qp_rem.
_int16*pt5;
//end
// Look up the scaled QP in QP_SCALE_CR and split it: the quotient by 6 becomes extra shift bits,
// the remainder selects which quant_coef / dequant_coef table is used.
qp_per = QP_SCALE_CR[img->qp-MIN_QP]/6;
qp_rem = QP_SCALE_CR[img->qp-MIN_QP]%6;
q_bits = Q_BITS+qp_per;
// Rounding offset added before the right shift by q_bits:
// one third of 2^q_bits for intra pictures, one sixth otherwise.
if (img->type == INTRA_IMG)
qp_const=(1<<q_bits)/3; // intra
else
qp_const=(1<<q_bits)/6; // inter
// The asm reads four 8-byte rows starting here (offsets 0, 8, 16 and 24).
pt5=quant_coef[qp_rem][0];
// Forward 4x4 transform of each sub-block of img->m7 (n1 and n2 each take the values 0 and BLOCK_SIZE).
// The transform is done in place with MMX; afterwards |coef| * quant_coef is written to mulres as
// 32-bit products, so the AC quantiser only has to add the rounding offset and shift.
for (n2=0; n2 <= BLOCK_SIZE; n2 += BLOCK_SIZE)
{
for (n1=0; n1 <= BLOCK_SIZE; n1 += BLOCK_SIZE)
{
pointer=&(img->m7[n1][n2]);
pt4=&(mulres[n1][n2]);
_asm
{
// eax = source rows, ebx = destination rows (same address, so the transform is in place),
// edi = current mulres sub-block, esi = quantiser table
mov eax,pointer
mov ebx,eax
mov edi,pt4
mov esi,pt5
// Load the matrix: four rows of four 16-bit values, consecutive rows addressed 32 bytes apart.
// NOTE(review): this assumes img->m7 holds 16-bit elements with a 32-byte row pitch - confirm against its declaration.
movq mm4,[eax]
movq mm5,[eax+32]
movq mm6,[eax+64]
movq mm7,[eax+96]
// Transpose the 4x4 block: afterwards mm4..mm7 hold columns 0..3.
movq mm0,mm4
movq mm1,mm5
movq mm2,mm6
PUNPCKLWD mm4,mm5
PUNPCKLWD mm6,mm7
movq mm5,mm4
PUNPCKLDQ mm4,mm6
PUNPCKHDQ mm5,mm6
PUNPCKHWD mm0,mm1
PUNPCKHWD mm2,mm7
movq mm7,mm0
PUNPCKLDQ mm0,mm2
PUNPCKHDQ mm7,mm2
movq mm6,mm0
// First pass, with a,b,c,d = mm4,mm5,mm6,mm7:
//   mm0 = a + b + c + d
//   mm1 = 2a + b - c - 2d
//   mm2 = a - b - c + d
//   mm3 = a - 2b + 2c - d
movq mm0,mm4
PADDW mm0,mm5
PADDW mm0,mm6
PADDW mm0,mm7
movq mm1,mm4
PSLLW mm1,1
PADDW mm1,mm5
PSUBW mm1,mm6
movq mm2,mm7
PSLLW mm2,1
PSUBW mm1,mm2
movq mm2,mm4
PADDW mm2,mm7
PSUBW mm2,mm5
PSUBW mm2,mm6
movq mm3,mm4
PSUBW mm3,mm7
PSLLW mm5,1
PSLLW mm6,1
PSUBW mm3,mm5
PADDW mm3,mm6
// Transpose again so the second pass runs along the other dimension.
movq mm4,mm0
movq mm5,mm1
movq mm6,mm2
movq mm7,mm3
PUNPCKLWD mm4,mm5
PUNPCKLWD mm6,mm7
movq mm5,mm4
PUNPCKLDQ mm4,mm6
PUNPCKHDQ mm5,mm6
PUNPCKHWD mm0,mm1
PUNPCKHWD mm2,mm7
movq mm7,mm0
PUNPCKLDQ mm0,mm2
PUNPCKHDQ mm7,mm2
movq mm6,mm0
// Second pass: same four combinations as the first pass.
movq mm0,mm4
PADDW mm0,mm5
PADDW mm0,mm6
PADDW mm0,mm7
movq mm1,mm4
PSLLW mm1,1
PADDW mm1,mm5
PSUBW mm1,mm6
movq mm2,mm7
PSLLW mm2,1
PSUBW mm1,mm2
movq mm2,mm4
PADDW mm2,mm7
PSUBW mm2,mm5
PSUBW mm2,mm6
movq mm3,mm4
PSUBW mm3,mm7
PSLLW mm5,1
PSLLW mm6,1
PSUBW mm3,mm5
PADDW mm3,mm6
// Write the signed transform coefficients back over the input block.
movq [ebx],mm0
movq [ebx+32],mm1
movq [ebx+64],mm2
movq [ebx+96],mm3
// Absolute value of every 16-bit lane: mask = x >> 15 (arithmetic), then |x| = (x ^ mask) - mask.
MOVQ MM4, MM0
PSRAW MM4, 15
PXOR MM0, MM4
PSUBW MM0, MM4
MOVQ MM5, MM1
PSRAW MM5, 15
PXOR MM1, MM5
PSUBW MM1, MM5
MOVQ MM6, MM2
PSRAW MM6, 15
PXOR MM2, MM6
PSUBW MM2, MM6
MOVQ MM7, MM3
PSRAW MM7, 15
PXOR MM3, MM7
PSUBW MM3, MM7
// Copy each row of absolute values into the low half of an XMM register.
movq2dq xmm0,mm0
movq2dq xmm1,mm1
movq2dq xmm2,mm2
movq2dq xmm3,mm3
// For each row k: load the four 16-bit quantiser entries at esi+8*k, then build full 32-bit products.
// PMULLW yields the low 16 bits and PMULHW the high 16 bits (signed) of each product;
// PUNPCKLWD interleaves them into four 32-bit results.
movq mm4,[esi]
movq2dq xmm4,mm4
MOVDQU xmm5,xmm0
PMULLW xmm0,xmm4
PMULHW xmm5,xmm4
PUNPCKLWD xmm0,xmm5
movq mm4,[esi+8]
movq2dq xmm4,mm4
MOVDQU xmm5,xmm1
PMULLW xmm1,xmm4
PMULHW xmm5,xmm4
PUNPCKLWD xmm1,xmm5
movq mm4,[esi+16]
movq2dq xmm4,mm4
MOVDQU xmm5,xmm2
PMULLW xmm2,xmm4
PMULHW xmm5,xmm4
PUNPCKLWD xmm2,xmm5
movq mm4,[esi+24]
movq2dq xmm4,mm4
MOVDQU xmm5,xmm3
PMULLW xmm3,xmm4
PMULHW xmm5,xmm4
PUNPCKLWD xmm3,xmm5
// Store the four rows of products; mulres rows are 8 * 4 = 32 bytes apart.
MOVDQU [edi],xmm0
MOVDQU [edi+32],xmm1
MOVDQU [edi+64],xmm2
MOVDQU [edi+96],xmm3
// Leave MMX state so that x87 floating-point code can run afterwards.
emms
}
}
}
/* Disabled scalar C version of the forward 4x4 transform (horizontal pass, then vertical pass):
for (n2=0; n2 <= BLOCK_SIZE; n2 += BLOCK_SIZE)
{
for (n1=0; n1 <= BLOCK_SIZE; n1 += BLOCK_SIZE)
{
// Horizontal transform.
for (j=0; j < BLOCK_SIZE; j++)
{
mb_y=n2+j;
for (i=0; i < 2; i++)
{
i1=3-i;
m5[i]=img->m7[i+n1][mb_y]+img->m7[i1+n1][mb_y];
m5[i1]=img->m7[i+n1][mb_y]-img->m7[i1+n1][mb_y];
}
img->m7[n1][mb_y] =(m5[0]+m5[1]);
img->m7[n1+2][mb_y]=(m5[0]-m5[1]);
img->m7[n1+1][mb_y]=m5[3]*2+m5[2];
img->m7[n1+3][mb_y]=m5[3]-m5[2]*2;
}
// Vertical transform.
for (i=0; i < BLOCK_SIZE; i++)
{
j1=n1+i;
for (j=0; j < 2; j++)
{
j2=3-j;
m5[j]=img->m7[j1][n2+j]+img->m7[j1][n2+j2];
m5[j2]=img->m7[j1][n2+j]-img->m7[j1][n2+j2];
}
img->m7[j1][n2+0]=(m5[0]+m5[1]);
img->m7[j1][n2+2]=(m5[0]-m5[1]);
img->m7[j1][n2+1]=m5[3]*2+m5[2];
img->m7[j1][n2+3]=m5[3]-m5[2]*2;
}
}
}*/
// 2X2 transform of DC coeffs.
// The inputs are the [0][0] entries of the four sub-blocks (offsets 0 and 4 in each index);
// m1[0..3] are their four sum/difference combinations.
m1[0]=(img->m7[0][0]+img->m7[4][0]+img->m7[0][4]+img->m7[4][4]);
m1[1]=(img->m7[0][0]-img->m7[4][0]+img->m7[0][4]-img->m7[4][4]);
m1[2]=(img->m7[0][0]+img->m7[4][0]-img->m7[0][4]-img->m7[4][4]);
m1[3]=(img->m7[0][0]-img->m7[4][0]-img->m7[0][4]+img->m7[4][4]);
// Quant of chroma 2X2 coeffs.
// run counts the zero levels seen since the last non-zero one; scan_pos is the next free output slot.
run=-1;
scan_pos=0;
for (coeff_ctr=0; coeff_ctr < 4; coeff_ctr++)
{
run++;
ilev=0;
// DC levels use twice the rounding offset and one extra shift bit compared with the AC path.
level =(abs(m1[coeff_ctr]) * quant_coef[qp_rem][0][0] + 2*qp_const) >> (q_bits+1);
if (level != 0)
{
currMB->cbp_blk |= 0xf0000 << (uv << 2) ; // if one of the 2x2-DC levels is != 0 set the
cr_cbp=max(1,cr_cbp); // coded-bit all 4 4x4 blocks (bit 16-19 or 20-23)
DCcoded = 1 ;
// Record the level (sign() presumably transfers the sign of its second argument - confirm) and its run.
DCLevel[scan_pos] = sign(level ,m1[coeff_ctr]);
DCRun [scan_pos] = run;
scan_pos++;
run=-1;
// Replace the coefficient by its dequantised value for the inverse transform below.
ilev=level*dequant_coef[qp_rem][0][0]<<qp_per;
m1[coeff_ctr]=sign(ilev,m1[coeff_ctr]);
}
else
m1[coeff_ctr]=0;
}
// Write a zero level after the last stored entry (presumably an end marker for whoever reads DCLevel).
DCLevel[scan_pos] = 0;
// Inverse transform of 2x2 DC levels
// The results go back into the [0][0] position of each sub-block, halved by the >>1.
img->m7[0][0]=(m1[0]+m1[1]+m1[2]+m1[3])>>1;
img->m7[4][0]=(m1[0]-m1[1]+m1[2]-m1[3])>>1;
img->m7[0][4]=(m1[0]+m1[1]-m1[2]-m1[3])>>1;
img->m7[4][4]=(m1[0]-m1[1]-m1[2]+m1[3])>>1;
// Quant of chroma AC-coeffs.
// coeff_cost accumulates a cost over all four sub-blocks; cr_cbp_tmp becomes 2 once any AC level is non-zero.
coeff_cost=0;
cr_cbp_tmp=0;
for (n2=0; n2 <= BLOCK_SIZE; n2 += BLOCK_SIZE)
{
for (n1=0; n1 <= BLOCK_SIZE; n1 += BLOCK_SIZE)
{
// b4 indexes the sub-block; its level and run arrays live in img->cofAC[uv+4][b4].
b4 = 2*(n2/4) + (n1/4);
ACLevel = img->cofAC[uv+4][b4][0];
ACRun = img->cofAC[uv+4][b4][1];
run=-1;
scan_pos=0;
// Scan positions 1..15 only; position 0 is skipped here (the DC terms were quantised by the 2x2 stage).
for (coeff_ctr=1; coeff_ctr < 16; coeff_ctr++)// start change rd_quant
{
i=SNGL_SCAN[coeff_ctr][0];
j=SNGL_SCAN[coeff_ctr][1];
++run;
ilev=0;
// mulres already holds |coef| * quant_coef from the asm loop; the disabled line shows the scalar form.
//level=(abs(img->m7[n1+i][n2+j])*quant_coef[qp_rem][i][j]+qp_const)>>q_bits;
level = (mulres[n1+i][n2+j] + qp_const) >> q_bits;
if (level != 0)
{
// Set the coded bit of this sub-block: bit 16 + 4*uv + an offset derived from n1 and n2.
currMB->cbp_blk |= 1 << (16 + (uv << 2) + ((n2 >> 1) + (n1 >> 2))) ;
if (level > 1)
coeff_cost += MAX_VALUE; // set high cost, shall not be discarded
else
coeff_cost += COEFF_COST[run];
cr_cbp_tmp=2;
ACLevel[scan_pos] = sign(level,img->m7[n1+i][n2+j]);
ACRun [scan_pos] = run;
++scan_pos;
run=-1;
// Overwrite the coefficient with its dequantised value, keeping the original sign.
ilev=level*dequant_coef[qp_rem][i][j]<<qp_per;
img->m7[n1+i][n2+j]=sign(ilev,img->m7[n1+i][n2+j]); // for use in IDCT
}
else
img->m7[n1+i][n2+j]=0; // for use in IDCT
}
// Write a zero level after the last stored entry of this sub-block.
ACLevel[scan_pos] = 0;
}
}
// * reset chroma coeffs
// If the accumulated cost of the AC levels is below _CHROMA_COEFF_COST_, discard every AC
// coefficient of all four sub-blocks: zero them in img->m7 and clear the stored levels.
if(coeff_cost < _CHROMA_COEFF_COST_)
{
cr_cbp_tmp = 0 ;
for (n2=0; n2 <= BLOCK_SIZE; n2 += BLOCK_SIZE)
{
for (n1=0; n1 <= BLOCK_SIZE; n1 += BLOCK_SIZE)
{
b4 = 2*(n2/4) + (n1/4);
ACLevel = img->cofAC[uv+4][b4][0];
ACRun = img->cofAC[uv+4][b4][1];
// This test does not depend on n1/n2; it is simply repeated for every sub-block.
if( DCcoded == 0) currMB->cbp_blk &= ~(0xf0000 << (uv << 2)); // if no chroma DC's: then reset coded-bits of this chroma subblock
// NOTE(review): nn0 and nn1 are assigned here but not read in the visible part of this function.
nn0 = (n1>>2) + (uv<<1);
nn1 = 4 + (n2>>2) ;
ACLevel[0] = 0;
for (coeff_ctr=1; coeff_ctr < 16; coeff_ctr++)// ac coeff
{
i=SNGL_SCAN[coeff_ctr][0];
j=SNGL_SCAN[coeff_ctr][1];
img->m7[n1+i][n2+j]=0;
ACLevel[coeff_ctr] = 0;
}
}
}
}
// AC levels survived (cr_cbp_tmp was set to 2 during quantisation and not cleared by the reset above).
if(cr_cbp_tmp==2)
cr_cbp = 2;
// IDCT.
for (n2=0; n2 <= BLOCK_SIZE; n2 += BLOCK_SIZE)
{
for (n1=0; n1 <= BLOCK_SIZE; n1 += BLOCK_SIZE)
{
pointer=&(img->m7[n1][n2]);
pt1=&(img->mpr[n1][n2]);
pt3=&(imgUV[uv][img->pix_c_y+n2][img->pix_c_x+n1]);
_asm
{
mov eax,pointer
mov ebx,pt2
//读入矩阵
movq mm4,[eax]
movq mm5,[eax+32]
movq mm6,[eax+64]
movq mm7,[eax+96]
//转秩
movq mm0,mm4
movq mm1,mm5
movq mm2,mm6
PUNPCKLWD mm4,mm5
PUNPCKLWD mm6,mm7
movq mm5,mm4
PUNPCKLDQ mm4,mm6
PUNPCKHDQ mm5,mm6
PUNPCKHWD mm0,mm1
PUNPCKHWD mm2,mm7
movq mm7,mm0
PUNPCKLDQ mm0,mm2
PUNPCKHDQ mm7,mm2
movq mm6,mm0
//计算第一遍
movq mm0,mm4
PADDW mm0,mm5
PADDW mm0,mm6
movq mm3,mm7
PSRAW mm3,1
PADDW mm0,mm3
movq mm1,mm4
movq mm3,mm5
PSRAW mm3,1
PADDW mm1,mm3
PSUBW mm1,mm6
PSUBW mm1,mm7
movq mm2,mm4
PADDW mm2,mm7
movq mm3,mm5
PSRAW mm3,1
PSUBW mm2,mm3
PSUBW mm2,mm6
movq mm3,mm4
PADDW mm3,mm6
PSUBW mm3,mm5
PSRAW mm7,1
PSUBW mm3,mm7
//转秩
movq mm4,mm0
movq mm5,mm1
movq mm6,mm2
movq mm7,mm3
PUNPCKLWD mm4,mm5
PUNPCKLWD mm6,mm7
movq mm5,mm4
PUNPCKLDQ mm4,mm6
PUNPCKHDQ mm5,mm6
PUNPCKHWD mm0,mm1
PUNPCKHWD mm2,mm7
movq mm7,mm0
PUNPCKLDQ mm0,mm2
PUNPCKHDQ mm7,mm2
movq mm6,mm0
//计算第二遍
movq mm0,mm4
PADDW mm0,mm5
PADDW mm0,mm6
movq mm3,mm7
PSRAW mm3,1
PADDW mm0,mm3
movq mm1,mm4
movq mm3,mm5
PSRAW mm3,1
PADDW mm1,mm3
PSUBW mm1,mm6
PSUBW mm1,mm7
movq mm2,mm4
PADDW mm2,mm7
movq mm3,mm5
PSRAW mm3,1
PSUBW mm2,mm3
PSUBW mm2,mm6
movq mm3,mm4
PADDW mm3,mm6
PSUBW mm3,mm5
PSRAW mm7,1
PSUBW mm3,mm7
movq [ebx],mm0
movq [ebx+8],mm1
movq [ebx+16],mm2
movq [ebx+24],mm3
mov eax,pt1
mov ebx,pt2
mov ecx,ebx
mov dx,4
movq mm5,dq
loop1_2:
pxor mm4,mm4
movq mm6,[eax]
movq mm7,[ebx]
movq mm0,mm6
PUNPCKLWD mm0,mm4
pslld mm0,16
psrad mm0,16
movq mm1,mm7
PUNPCKLWD mm1,mm4
pslld mm1,16
psrad mm1,16
movq mm2,mm6
PUNPCKHWD mm2,mm4
pslld mm2,16
psrad mm2,16
movq mm3,mm7
PUNPCKHWD mm3,mm4
pslld mm3,16
psrad mm3,16
pxor mm7,mm7
PSLLD mm0,6
paddd mm0,mm1
paddd mm0,mm5
psrad mm0,6
PSLLD mm2,6
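/*
 * Keyboard shortcuts (help text from the code viewer; not part of block.c)
 *   Copy code:           Ctrl + C
 *   Search code:         Ctrl + F
 *   Full-screen mode:    F11
 *   Toggle theme:        Ctrl + Shift + D
 *   Show shortcuts:      ?
 *   Increase font size:  Ctrl + =
 *   Decrease font size:  Ctrl + -
 */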