📄 nic_postprocess.cpp
字号:
#include "stdafx.h"
#include "nic_postProcess.h"
#include "postproc/postprocFilters.h"
#include "simd.h"
#include "Tconfig.h"
/* Save the current warning state, then silence warnings 4309 and 4799 for the code that follows.
   NOTE(review): on MSVC these should be C4309 (truncation of constant value) and
   C4799 (function has no EMMS instruction) - confirm against the compiler docs.
   The matching "#pragma warning(pop)" is not within this part of the file. */
#pragma warning(push)
#pragma warning(disable:4309)
#pragma warning(disable:4799)
/******************* general, useful macros ****************/
/* Build-time switches; all three are commented out (disabled) here. */
//#define DEBUGMODE
//#define SELFCHECK
//#define PREFETCH
/* PREFETCH: defines the two look-ahead constants and PREFETCH_ENABLE, which
   guards the prefetch statements in the filter code. */
#ifdef PREFETCH
#define PREFETCH_AHEAD_V 8
#define PREFETCH_AHEAD_H 8
#define PREFETCH_ENABLE
#endif
/* SELFCHECK: pulls in ffdebug.h and defines PP_SELF_CHECK / SELF_CHECK, which
   enable scalar reference computations that are compared against the MMX results. */
#ifdef SELFCHECK
#include "ffdebug.h"
#define PP_SELF_CHECK
#define SELF_CHECK
/* SIGN(a): -1 for negative a, otherwise 1 (zero yields 1). */
#define SIGN(a) ( (a)<0 ? -1 : 1 )
#endif
/* DEBUGMODE: defines the SHOW*DECISIONS* symbols; their consumers are not in this part of the file. */
#ifdef DEBUGMODE
#define SHOWDECISIONS_H
#define SHOW_DECISIONS
#define SHOWDECISIONS_V
#endif
/*
 * Decide DC mode or default mode for the horizontal filter.
 *
 * For each of 4 rows (row k starts at v + k*stride) the seven neighbouring
 * pairs (v[j], v[j+1]), j = 1..7, are examined; a pair counts as "equal" when
 * its absolute difference is <= 1.  The total count (0..28) is compared with
 * the threshold.  The scalar loop under PP_SELF_CHECK spells out the same count.
 *
 * v      - pixel pointer; bytes v[1]..v[8] of each of the 4 rows are read.
 * stride - distance between rows.  NOTE(review): the row pointer is a __m64*
 *          advanced by stride/8 elements, which equals `stride` bytes only
 *          when stride is a multiple of 8 - confirm callers guarantee that.
 * DEBLOCK_HORIZ_USEDC_THR - minimum number of "equal" pairs required for DC mode.
 *
 * Returns non-zero when eq_cnt >= DEBLOCK_HORIZ_USEDC_THR, zero otherwise.
 *
 * NOTE(review): movq/psubusb/por/pand/pxor/pcmpeqb/psubb/psrlq/psllq/paddb/movd
 * are helpers declared outside this block (presumably simd.h); the comments
 * below assume each behaves like the MMX instruction of the same name, with
 * the first argument as destination.  The work on the four rows is interleaved,
 * a leftover of the original hand-scheduled assembly.
 */
static inline int deblock_horiz_useDC(uint8_t *v, stride_t stride, int DEBLOCK_HORIZ_USEDC_THR)
{
__m64 mm64_mask = _mm_set_pi8(0,-2,-2,-2,-2,-2,-2,-2);// bytes 7..0 = (0x00,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe,0xfe)
int32_t mm32_result;
__m64 *pmm1;
int eq_cnt, useDC;
#ifdef PP_SELF_CHECK
int eq_cnt2, j, k;
#endif
pmm1 = (__m64*)(&(v[1])); /* original note: this is a 32-bit aligned pointer, not 64-aligned */
// mov eax, pmm1
/* first load the constants: mm6 = mask, mm4 = 0 (mm7, the counter, is zeroed further down) */
__m64 mm6=mm64_mask; // movq mm6, mm64_mask /* mm6 = 0x00fefefefefefefe: drops bit 0 of bytes 0..6, clears byte 7 */
__m64 mm4=_mm_setzero_si64();// pxor mm4, mm4 /* mm4 = 0x0000000000000000 */
__m64 mm1=*pmm1; // movq mm1, qword ptr [eax] /* mm1 = row 0, bytes v[1]..v[8] */
pmm1+=stride/8; // add eax, stride /* advance to row 1 (stride/8 __m64 elements) */
__m64 mm5=mm1;// movq mm5, mm1 /* mm5 = row 0 */
mm1=_mm_srli_si64(mm1,8);// psrlq mm1, 8 /* mm1 = row 0 shifted down one byte: byte i now holds the right-hand neighbour */
__m64 mm2=mm5;// movq mm2, mm5 /* mm2 = row 0 (copy for the reverse subtraction) */
mm5=_mm_subs_pu8(mm5,mm1);// psubusb mm5, mm1 /* mm5 = saturating (pixel - neighbour) */
__m64 mm3;
movq(mm3, *pmm1); /* mm3 = row 1 */
psubusb(mm1, mm2); /* mm1 = saturating (neighbour - pixel) */
pmm1+=stride/8;//add eax, stride /* advance to row 2 */
por(mm5, mm1); /* mm5 = |pixel - neighbour| for row 0 */
__m64 mm0;
movq( mm0, mm3); /* mm0 = row 1 */
pand( mm5, mm6); /* mm5 &= 0x00fefefefefefefe: differences of 0 or 1 become 0, byte 7 is forced to 0 */
__m64 mm7;
pxor( mm7, mm7); /* mm7 = 0: per-byte running count of "equal" pairs */
pcmpeqb( mm5, mm4); /* bytes of mm5 become 0xff where the masked difference is 0 */
movq( mm1, *pmm1); /* mm1 = row 2 */
psubb( mm7, mm5); /* mm7 -= 0xff, i.e. +1 in every byte that compared equal (row 0) */
psrlq (mm3, 8); /* mm3 = row 1 shifted down one byte */
movq( mm5, mm0); /* mm5 = row 1 (copy for the reverse subtraction) */
psubusb( mm0, mm3); /* mm0 = saturating (pixel - neighbour) */
pmm1+=stride/8;//add eax, stride /* advance to row 3 */
psubusb (mm3, mm5); /* mm3 = saturating (neighbour - pixel) */
movq (mm5, *pmm1);// /* mm5 = row 3 */
por (mm0, mm3); /* mm0 = |pixel - neighbour| for row 1 */
movq( mm3, mm1); /* mm3 = row 2 */
pand( mm0, mm6); /* mm0 &= 0x00fefefefefefefe */
psrlq( mm1, 8); /* mm1 = row 2 shifted down one byte */
pcmpeqb( mm0, mm4); /* 0xff where the masked difference is 0 */
movq( mm2, mm3); /* mm2 = row 2 (copy for the reverse subtraction) */
psubb( mm7, mm0); /* add row 1 matches to the running counts */
psubusb( mm3, mm1); /* mm3 = saturating (pixel - neighbour) */
psubusb( mm1, mm2); /* mm1 = saturating (neighbour - pixel) */
por( mm3, mm1); /* mm3 = |pixel - neighbour| for row 2 */
movq( mm1, mm5); /* mm1 = row 3 */
pand( mm3, mm6); /* mm3 &= 0x00fefefefefefefe */
psrlq( mm5, 8); /* mm5 = row 3 shifted down one byte */
pcmpeqb( mm3, mm4); /* 0xff where the masked difference is 0 */
movq ( mm0, mm1); /* mm0 = row 3 (copy for the reverse subtraction) */
psubb ( mm7, mm3); /* add row 2 matches to the running counts */
psubusb( mm1, mm5); /* mm1 = saturating (pixel - neighbour) */
psubusb( mm5, mm0); /* mm5 = saturating (neighbour - pixel) */
por ( mm1, mm5); /* mm1 = |pixel - neighbour| for row 3 */
pand ( mm1, mm6); /* mm1 &= 0x00fefefefefefefe */
pcmpeqb( mm1, mm4); /* 0xff where the masked difference is 0 */
psubb ( mm7, mm1); /* add row 3 matches to the running counts */
/* Horizontal sum: fold the counts of bytes 0..6 into byte 0.  Byte 7 always
   compared "equal" because the mask cleared it; the left shift below keeps it
   out of the byte-0 total. */
movq (mm1, mm7); /* mm1 = copy of the counts */
psllq (mm7, 8); /* mm7 <<= 8: byte 7 is discarded, counts move up one byte */
psrlq (mm1, 24); /* mm1 >>= 24: counts of bytes 3..7 move to bytes 0..4 */
paddb (mm7, mm1); /* byte-wise partial sums */
movq (mm1, mm7); /* mm1 = copy of the partial sums */
psrlq (mm7, 16); /* mm7 >>= 16 */
paddb (mm7, mm1); /* byte-wise partial sums */
movq (mm1, mm7); /* mm1 = copy of the partial sums */
psrlq ( mm7, 8); /* mm7 >>= 8 */
paddb (mm7, mm1); /* byte 0 of mm7 = total number of "equal" pairs */
movd (mm32_result, mm7); /* fetch the low 32 bits */
//pop eax
eq_cnt = mm32_result & 0xff; /* only byte 0 holds the total */
#ifdef PP_SELF_CHECK
/* scalar reference: count pairs (v[j], v[j+1]), j = 1..7, over 4 rows that differ by at most 1 */
eq_cnt2 = 0;
for (k=0; k<4; k++)
{
for (j=1; j<=7; j++)
{
if (abs(v[j+k*stride]-v[1+j+k*stride]) <= 1) eq_cnt2++;
}
}
if (eq_cnt2 != eq_cnt)
DPRINTF(_l("ERROR: MMX version of useDC is incorrect"));
#endif
useDC = eq_cnt >= DEBLOCK_HORIZ_USEDC_THR;
return useDC;
}
/*
 * Decide whether the DC filter should be turned on according to QP.
 *
 * Looks at 4 consecutive rows, the first at v and each following one `stride`
 * further on.  In every row five pixel pairs are compared; as soon as one
 * pair differs by 2*QP or more the answer is "off".
 *
 * v      - first row; indices 0..8 of each row are read
 * stride - distance between consecutive rows
 * QP     - quantiser value; the tolerated difference is anything below 2*QP
 *
 * Returns true (non-zero) when every checked pair in all 4 rows stays below
 * 2*QP, false (zero) otherwise.
 */
static inline int deblock_horiz_DC_on(uint8_t *v, stride_t stride, int QP)
{
    /* 99% of the time, this test turns out the same as the |max-min| strategy in the standard */
    /* single-pair variant kept for reference: abs(v[1]-v[8]) < 2*QP */
    const int limit = 2*QP;
    uint8_t *row = v;
    int rows_left = 4;

    while (rows_left-- > 0)
    {
        /* pairs are tested in the same order as before; || stops at the first failure */
        if (abs(row[0]-row[5]) >= limit ||
            abs(row[1]-row[8]) >= limit ||
            abs(row[1]-row[4]) >= limit ||
            abs(row[2]-row[7]) >= limit ||
            abs(row[3]-row[6]) >= limit)
        {
            return false;
        }
        row += stride;
    }
    return true;
}
/* The 9-tap low pass filter used in "DC" regions */
/* I'm not sure that I like this implementation any more...! */
static inline void deblock_horiz_lpf9(uint8_t *v, stride_t stride, int QP)
{
int y, p1, p2;
#ifdef PP_SELF_CHECK
uint8_t selfcheck[9];
int psum;
uint8_t *vv;
int i;
#endif
uint8_t *pmm1;
uint32_t mm32_p1p2;
static const uint64_t mm64_coefs[18] =
{
0x0001000200040006LL, /* p1 left */ 0x0000000000000001LL, /* v1 right */
0x0001000200020004LL, /* v1 left */ 0x0000000000010001LL, /* v2 right */
0x0002000200040002LL, /* v2 left */ 0x0000000100010002LL, /* v3 right */
0x0002000400020002LL, /* v3 left */ 0x0001000100020002LL, /* v4 right */
0x0004000200020001LL, /* v4 left */ 0x0001000200020004LL, /* v5 right */
0x0002000200010001LL, /* v5 left */ 0x0002000200040002LL, /* v6 right */
0x0002000100010000LL, /* v6 left */ 0x0002000400020002LL, /* v7 right */
0x0001000100000000LL, /* v7 left */ 0x0004000200020001LL, /* v8 right */
0x0001000000000000LL, /* v8 left */ 0x0006000400020001LL /* p2 right */
};
__m64 mm64_0008 = _mm_set1_pi16(0x0008);
__m64 mm0,mm2,mm7,mm6,mm5,mm1,mm3,mm4;
__m64 mm64_temp;
for (y=0; y<4; y++)
{
p1 = (abs(v[0+y*stride]-v[1+y*stride]) < QP ) ? v[0+y*stride] : v[1+y*stride];
p2 = (abs(v[8+y*stride]-v[9+y*stride]) < QP ) ? v[9+y*stride] : v[8+y*stride];
mm32_p1p2 = 0x0101 * ((p2 << 16) + p1);
#ifdef PP_SELF_CHECK
/* generate a self-check version of the filter result in selfcheck[9] */
/* low pass filtering (LPF9: 1 1 2 2 4 2 2 1 1) */
vv = &(v[y*stride]);
psum = p1 + p1 + p1 + vv[1] + vv[2] + vv[3] + vv[4] + 4;
selfcheck[1] = (((psum + vv[1]) << 1) - (vv[4] - vv[5])) >> 4;
psum += vv[5] - p1;
selfcheck[2] = (((psum + vv[2]) << 1) - (vv[5] - vv[6])) >> 4;
psum += vv[6] - p1;
selfcheck[3] = (((psum + vv[3]) << 1) - (vv[6] - vv[7])) >> 4;
psum += vv[7] - p1;
selfcheck[4] = (((psum + vv[4]) << 1) + p1 - vv[1] - (vv[7] - vv[8])) >> 4;
psum += vv[8] - vv[1];
selfcheck[5] = (((psum + vv[5]) << 1) + (vv[1] - vv[2]) - vv[8] + p2) >> 4;
psum += p2 - vv[2];
selfcheck[6] = (((psum + vv[6]) << 1) + (vv[2] - vv[3])) >> 4;
psum += p2 - vv[3];
selfcheck[7] = (((psum + vv[7]) << 1) + (vv[3] - vv[4])) >> 4;
psum += p2 - vv[4];
selfcheck[8] = (((psum + vv[8]) << 1) + (vv[4] - vv[5])) >> 4;
#endif
pmm1 = (&(v[y*stride-3])); /* this is 64-aligned */
/* mm7 = 0, mm6 is left hand accumulator, mm5 is right hand acc */
unsigned char *eax=(unsigned char*)pmm1;
unsigned char *ebx=(unsigned char*)&mm64_coefs[0];
#ifdef PREFETCH_ENABLE
prefetcht0 32[ebx]
#endif
movd (mm0, mm32_p1p2); /* mm0 = ________p2p2p1p1 0w1 2 3 4 5 6 7 */
punpcklbw (mm0, mm0); /* mm0 = p2p2p2p2p1p1p1p1 0m1 2 3 4 5 6 7 */
movq (mm2, eax); /* mm2 = v4v3v2v1xxxxxxxx 0 1 2w3 4 5 6 7 */
pxor (mm7, mm7); /* mm7 = 0000000000000000 0 1 2 3 4 5 6 7w */
movq (mm6, mm64_0008); /* mm6 = 0008000800080008 0 1 2 3 4 5 6w7 */
punpckhbw (mm2, mm2); /* mm2 = v4__v3__v2__v1__ 0 1 2m3 4 5 6 7 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -