📄 nic_postprocess.cpp
字号:
v[2*stride + 4] =
v[3*stride + 4] = 255;
}
#endif
}
}
else /* use default mode */
{
deblock_horiz_default_filter(v, stride, QP);
#ifdef SHOWDECISIONS_H
if (!chromaFlag)
{
v[0*stride + 4] =
v[1*stride + 4] =
v[2*stride + 4] =
v[3*stride + 4] = 0;
}
#endif
}
}
}
/* decide DC mode or default mode in assembler */
static inline int deblock_vert_useDC(uint8_t *v, stride_t stride, int DEBLOCK_VERT_USEDC_THR)
{
__m64 mask = _mm_set1_pi8(/*0xfe*/-2);
int32_t mm_data1;
uint64_t *pmm1;
int eq_cnt, useDC;
#ifdef PP_SELF_CHECK
int useDC2, i, j;
#endif
#ifdef PP_SELF_CHECK
/* C-code version for testing */
eq_cnt = 0;
for (j=1; j<8; j++)
{
for (i=0; i<8; i++)
{
if (abs(v[j*stride+i] - v[(j+1)*stride+i]) <= 1) eq_cnt++;
}
}
useDC2 = (eq_cnt > DEBLOCK_VERT_USEDC_THR);
#endif
/* starting pointer is at v[stride] == v1 in mpeg4 notation */
pmm1 = (uint64_t *)(&(v[stride]));
/* first load some constants into mm4, mm6, mm7 */
//push eax
unsigned char *eax=(unsigned char*)pmm1;
__m64 mm6,mm7,mm2,mm4,mm3;
movq (mm6, mask); /*mm6 = 0xfefefefefefefefe */
pxor (mm7, mm7); /*mm7 = 0x0000000000000000 */
movq (mm2, eax); /* mm2 = *p_data */
pxor (mm4, mm4); /*mm4 = 0x0000000000000000 */
eax+=stride; /* p_data += stride */
movq (mm3, mm2); /* mm3 = *p_data */
__m64 mm0,mm1;
movq (mm2, eax); /* mm2 = *p_data */
movq (mm0, mm3); /* mm0 = mm3 */
movq (mm3, mm2); /* mm3 = *p_data */
movq (mm1, mm0); /* mm1 = mm0 */
psubusb (mm0, mm2); /* mm0 -= mm2 */
eax+=stride; /* p_data += stride */
psubusb (mm2, mm1); /* mm2 -= mm1 */
por (mm0, mm2); /* mm0 |= mm2 */
pand (mm0, mm6); /* mm0 &= 0xfefefefefefefefe */
pcmpeqb (mm0, mm4); /* is mm0 == 0 ? */
movq (mm2, eax); /* mm2 = *p_data */
psubb (mm7, mm0); /* mm7 has running total of eqcnts */
__m64 mm5;
movq (mm5, mm3); /* mm5 = mm3 */
movq (mm3, mm2); /* mm3 = *p_data */
movq (mm1, mm5); /* mm1 = mm5 */
psubusb (mm5, mm2); /* mm5 -= mm2 */
psubusb (mm2, mm1); /* mm2 -= mm1 */
por (mm5, mm2); /* mm5 |= mm2 */
eax+=stride; /* p_data += stride */
pand (mm5, mm6); /* mm5 &= 0xfefefefefefefefe */
pcmpeqb (mm5, mm4); /* is mm0 == 0 ? */
psubb (mm7, mm5); /* mm7 has running total of eqcnts */
movq (mm2, eax); /* mm2 = *p_data */
movq (mm0, mm3); /* mm0 = mm3 */
movq (mm3, mm2); /* mm3 = *p_data */
movq (mm1, mm0); /* mm1 = mm0 */
psubusb (mm0, mm2); /* mm0 -= mm2 */
eax+=stride; /* p_data += stride */
psubusb (mm2, mm1); /* mm2 -= mm1 */
por (mm0, mm2); /* mm0 |= mm2 */
pand (mm0, mm6); /* mm0 &= 0xfefefefefefefefe */
pcmpeqb (mm0, mm4); /* is mm0 == 0 ? */
movq (mm2, eax); /* mm2 = *p_data */
psubb (mm7, mm0); /* mm7 has running total of eqcnts */
movq (mm5, mm3); /* mm5 = mm3 */
movq (mm3, mm2); /* mm3 = *p_data */
movq (mm1, mm5); /* mm1 = mm5 */
psubusb (mm5, mm2); /* mm5 -= mm2 */
psubusb (mm2, mm1); /* mm2 -= mm1 */
por (mm5, mm2); /* mm5 |= mm2 */
eax+=stride; /* p_data += stride */
pand (mm5, mm6); /* mm5 &= 0xfefefefefefefefe */
pcmpeqb (mm5, mm4); /* is mm0 == 0 ? */
psubb (mm7, mm5); /* mm7 has running total of eqcnts */
movq (mm2, eax); /* mm2 = *p_data */
movq (mm0, mm3); /* mm0 = mm3 */
movq (mm3, mm2); /* mm3 = *p_data */
movq (mm1, mm0); /* mm1 = mm0 */
psubusb (mm0, mm2); /* mm0 -= mm2 */
eax+=stride; /* p_data += stride */
psubusb (mm2, mm1); /* mm2 -= mm1 */
por (mm0, mm2); /* mm0 |= mm2 */
pand (mm0, mm6); /* mm0 &= 0xfefefefefefefefe */
pcmpeqb (mm0, mm4); /* is mm0 == 0 ? */
movq (mm2, eax); /* mm2 = *p_data */
psubb (mm7, mm0); /* mm7 has running total of eqcnts */
movq (mm5, mm3); /* mm5 = mm3 */
movq (mm3, mm2); /* mm3 = *p_data */
movq (mm1, mm5); /* mm1 = mm5 */
psubusb (mm5, mm2); /* mm5 -= mm2 */
psubusb (mm2, mm1); /* mm2 -= mm1 */
por (mm5, mm2); /* mm5 |= mm2 */
eax+=stride; /* p_data += stride */
pand (mm5, mm6); /* mm5 &= 0xfefefefefefefefe */
pcmpeqb (mm5, mm4); /* is mm0 == 0 ? */
psubb (mm7, mm5); /* mm7 has running total of eqcnts */
movq (mm2, eax); /* mm2 = *p_data */
movq (mm0, mm3); /* mm0 = mm3 */
movq (mm3, mm2); /* mm3 = *p_data */
movq (mm1, mm0); /* mm1 = mm0 */
psubusb (mm0, mm2); /* mm0 -= mm2 */
eax+=stride; /* p_data += stride */
psubusb (mm2, mm1); /* mm2 -= mm1 */
por (mm0, mm2); /* mm0 |= mm2 */
pand (mm0, mm6); /* mm0 &= 0xfefefefefefefefe */
pcmpeqb (mm0, mm4); /* is mm0 == 0 ? */
psubb (mm7, mm0); /* mm7 has running total of eqcnts */
//pop eax
/* now mm7 contains negative eq_cnt for all 8-columns */
/* copy this to mm_data1 */
/* sum all 8 bytes in mm7 */
movq (mm1, mm7); /* mm1 = mm7 0 1w2 3 4 5 6 7r */
psrlq (mm7, 32); /* mm7 >>= 32 0 1 2 3 4 5 6 7m */
paddb (mm7, mm1); /* mm7 has running total of eqcnts */
movq (mm1, mm7); /* mm1 = mm7 0 1w2 3 4 5 6 7r */
psrlq (mm7, 16); /* mm7 >>= 16 0 1 2 3 4 5 6 7m */
paddb (mm1, mm7); /* mm7 has running total of eqcnts */
movq (mm7, mm1); /* mm1 = mm7 0 1w2 3 4 5 6 7r */
psrlq (mm7, 8); /* mm7 >>= 8 0 1 2 3 4 5 6 7m */
paddb (mm7, mm1); /* mm7 has running total of eqcnts */
movd (mm_data1, mm7); /* mm_data1 = mm7 */
eq_cnt = mm_data1 & 0xff;
useDC = (eq_cnt > DEBLOCK_VERT_USEDC_THR);
#ifdef PP_SELF_CHECK
if (useDC != useDC2) DPRINTF(_l("ERROR: MMX version of useDC is incorrect"));
#endif
return useDC;
}
/*
 * Decide whether the DC filter should be turned on according to QP.
 *
 * Compares five pairs of 8-byte rows of v -- rows (0,5), (1,4), (1,8), (2,7)
 * and (3,6), each row index multiplied by stride -- and returns 0 as soon as
 * any byte of a pair differs by more than the per-byte limit; returns 1 when
 * every pair stays within the limit.
 *
 * NOTE(review): the limit is built as 0x02020202*QP, which places 2*QP in
 * every byte only while 2*QP fits in a byte -- confirm the QP range callers use.
 */
static inline int deblock_vert_DC_on(uint8_t *v, stride_t stride, int QP)
{
    static const int first_row[]  = {0, 1, 1, 2, 3};
    static const int second_row[] = {5, 4, 8, 7, 6};

    const __m64 limit = _mm_set1_pi32(0x02020202*QP);

    for (int pair = 0; pair < 5; pair++)
    {
        const __m64 a = *(__m64*)&(v[first_row[pair]*stride]);
        const __m64 b = *(__m64*)&(v[second_row[pair]*stride]);

        /* one of the two saturating differences is zero, so OR yields |a - b| */
        const __m64 abs_diff = _mm_or_si64(_mm_subs_pu8(a, b), _mm_subs_pu8(b, a));

        /* a byte is non-zero only where |a - b| exceeds the limit */
        const __m64 excess = _mm_subs_pu8(abs_diff, limit);

        /* OR the high half into the low half so one 32-bit read sees all 8 bytes */
        const __m64 folded = _mm_or_si64(_mm_srli_si64(excess, 32), excess);

        if (_mm_cvtsi64_si32(folded) != 0)
        {
            return 0;
        }
    }
    return 1;
}
/* function using MMX to copy an 8-pixel wide column and unpack to 16-bit values */
/* n is the number of rows to copy - this must be even */
static inline void deblock_vert_copy_and_unpack(stride_t stride, uint8_t *source, uint64_t *dest, int n)
{
uint64_t *pmm1 = (uint64_t *)source;
uint64_t *pmm2 = (uint64_t *)dest;
int i = -n / 2;
#ifdef PP_SELF_CHECK
int j, k;
#endif
/* copy block to local store whilst unpacking to 16-bit values */
unsigned char *eax=(unsigned char*)pmm1;
unsigned char *ebx=(unsigned char*)pmm2;
__m64 mm7=_mm_setzero_si64(); /* set mm7 = 0 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -