📄 nic_postprocess.cpp
字号:
movq (mm64_temp, mm0); /*temp = p2p2p2p2p1p1p1p1 0r1 2 3 4 5 6 7 */
punpcklbw (mm0, mm7); /* mm0 = __p1__p1__p1__p1 0m1 2 3 4 5 6 7 */
movq (mm5, mm6); /* mm5 = 0008000800080008 0 1 2 3 4 5w6r7 */
pmullw (mm0, ebx); /* mm0 *= mm64_coefs[0] 0m1 2 3 4 5 6 7 */
movq (mm1, mm2); /* mm1 = v4v4v3v3v2v2v1v1 0 1w2r3 4 5 6 7 */
punpcklbw (mm2, mm2); /* mm2 = v2v2v2v2v1v1v1v1 0 1 2m3 4 5 6 7 */
punpckhbw (mm1, mm1); /* mm1 = v4v4v4v4v3v3v3v3 0 1m2 3 4 5 6 7 */
#ifdef PREFETCH_ENABLE
prefetcht0 32[ebx]
#endif
movq (mm3, mm2); /* mm3 = v2v2v2v2v1v1v1v1 0 1 2r3w4 5 6 7 */
punpcklbw (mm2, mm7); /* mm2 = __v1__v1__v1__v1 0 1 2m3 4 5 6 7 */
punpckhbw (mm3, mm7); /* mm3 = __v2__v2__v2__v2 0 1 2 3m4 5 6 7 */
paddw (mm6, mm0); /* mm6 += mm0 0r1 2 3 4 5 6m7 */
movq (mm0, mm2); /* mm0 = __v1__v1__v1__v1 0w1 2r3 4 5 6 7 */
pmullw (mm0, 8+ebx); /* mm0 *= mm64_coefs[1] 0m1 2 3 4 5 6 7 */
movq (mm4, mm3); /* mm4 = __v2__v2__v2__v2 0 1 2 3r4w5 6 7 */
pmullw (mm2, 16+ebx); /* mm2 *= mm64_coefs[2] 0 1 2m3 4 5 6 7 */
pmullw (mm3, 32+ebx); /* mm3 *= mm64_coefs[4] 0 1 2 3m4 5 6 7 */
pmullw (mm4, 24+ebx); /* mm4 *= mm64_coefs[3] 0 1 2 3 4m5 6 7 */
paddw (mm5, mm0); /* mm5 += mm0 0r1 2 3 4 5m6 7 */
paddw (mm6, mm2); /* mm6 += mm2 0 1 2r3 4 5 6m7 */
movq (mm2, mm1); /* mm2 = v4v4v4v4v3v3v3v3 0 1 2 3 4 5 6 7 */
punpckhbw (mm2, mm7); /* mm2 = __v4__v4__v4__v4 0 1 2m3 4 5 6 7r */
paddw (mm5, mm4); /* mm5 += mm4 0 1 2 3 4r5m6 7 */
punpcklbw (mm1, mm7); /* mm1 = __v3__v3__v3__v3 0 1m2 3 4 5 6 7r */
paddw (mm6, mm3); /* mm6 += mm3 0 1 2 3r4 5 6m7 */
#ifdef PREFETCH_ENABLE
prefetcht0 64[ebx]
#endif
movq (mm0, mm1); /* mm0 = __v3__v3__v3__v3 0w1 2 3 4 5 6 7 */
pmullw (mm1, 48+ebx); /* mm1 *= mm64_coefs[6] 0 1m2 3 4 5 6 7 */
pmullw (mm0, 40+ebx); /* mm0 *= mm64_coefs[5] 0m1 2 3 4 5 6 7 */
movq (mm4, mm2); /* mm4 = __v4__v4__v4__v4 0 1 2r3 4w5 6 7 */
pmullw (mm2, 64+ebx); /* mm2 *= mm64_coefs[8] 0 1 2 3 4 5 6 7 */
paddw (mm6, mm1); /* mm6 += mm1 0 1 2 3 4 5 6 7 */
pmullw (mm4, 56+ebx); /* mm4 *= mm64_coefs[7] 0 1 2 3 4m5 6 7 */
pxor (mm3, mm3); /* mm3 = 0000000000000000 0 1 2 3w4 5 6 7 */
movq (mm1, 8+eax); /* mm1 = xxxxxxxxv8v7v6v5 0 1w2 3 4 5 6 7 */
paddw (mm5, mm0); /* mm5 += mm0 0r1 2 3 4 5 6 7 */
punpcklbw (mm1, mm1); /* mm1 = v8v8v7v7v6v6v5v5 0 1m2 3m4 5 6 7 */
paddw (mm6, mm2); /* mm6 += mm2 0 1 2r3 4 5 6 7 */
#ifdef PREFETCH_ENABLE
prefetcht0 96[ebx]
#endif
movq (mm2, mm1); /* mm2 = v8v8v7v7v6v6v5v5 0 1r2w3 4 5 6 7 */
paddw (mm5, mm4); /* mm5 += mm4 0 1 2 3 4r5 6 7 */
punpcklbw (mm2, mm2); /* mm2 = v6v6v6v6v5v5v5v5 0 1 2m3 4 5 6 7 */
punpckhbw (mm1, mm1); /* mm1 = v8v8v8v8v7v7v7v7 0 1m2 3 4 5 6 7 */
movq (mm3, mm2); /* mm3 = v6v6v6v6v5v5v5v5 0 1 2r3w4 5 6 7 */
punpcklbw (mm2, mm7); /* mm2 = __v5__v5__v5__v5 0 1 2m3 4 5 6 7r */
punpckhbw (mm3, mm7); /* mm3 = __v6__v6__v6__v6 0 1 2 3m4 5 6 7r */
movq (mm0, mm2); /* mm0 = __v5__v5__v5__v5 0w1 2b3 4 5 6 7 */
pmullw (mm0, 72+ebx); /* mm0 *= mm64_coefs[9] 0m1 2 3 4 5 6 7 */
movq (mm4, mm3); /* mm4 = __v6__v6__v6__v6 0 1 2 3 4w5 6 7 */
pmullw (mm2, 80+ebx); /* mm2 *= mm64_coefs[10] 0 1 2m3 4 5 6 7 */
pmullw (mm3, 96+ebx); /* mm3 *= mm64_coefs[12] 0 1 2 3m4 5 6 7 */
pmullw (mm4, 88+ebx); /* mm4 *= mm64_coefs[11] 0 1 2 3 4m5 6 7 */
paddw (mm5, mm0); /* mm5 += mm0 0r1 2 3 4 5 6 7 */
paddw (mm6, mm2); /* mm6 += mm2 0 1 2r3 4 5 6 7 */
movq (mm2, mm1); /* mm2 = v8v8v8v8v7v7v7v7 0 1r2w3 4 5 6 7 */
paddw (mm6, mm3); /* mm6 += mm3 0 1 2 3r4 5 6 7 */
punpcklbw (mm1, mm7); /* mm1 = __v7__v7__v7__v7 0 1m2 3 4 5 6 7r */
paddw (mm5, mm4); /* mm5 += mm4 0 1 2 3 4r5 6 7 */
punpckhbw (mm2, mm7); /* mm2 = __v8__v8__v8__v8 0 1 2m3 4 5 6 7 */
#ifdef PREFETCH_ENABLE
prefetcht0 128[ebx]
#endif
movq (mm3, mm64_temp); /* mm3 = p2p2p2p2p1p1p1p1 0 1 2 3w4 5 6 7 */
movq (mm0, mm1); /* mm0 = __v7__v7__v7__v7 0w1r2 3 4 5 6 7 */
pmullw (mm0, 104+ebx); /* mm0 *= mm64_coefs[13] 0m1b2 3 4 5 6 7 */
movq (mm4, mm2); /* mm4 = __v8__v8__v8__v8 0 1 2r3 4w5 6 7 */
pmullw (mm1, 112+ebx); /* mm1 *= mm64_coefs[14] 0 1w2 3 4 5 6 7 */
punpckhbw (mm3, mm7); /* mm3 = __p2__p2__p2__p2 0 1 2 3 4 5 6 7 */
pmullw (mm2, 128+ebx); /* mm2 *= mm64_coefs[16] 0 1b2m3 4 5 6 7 */
pmullw (mm4, 120+ebx); /* mm4 *= mm64_coefs[15] 0 1b2 3 4m5 6 7 */
paddw (mm5, mm0); /* mm5 += mm0 0r1 2 3 4 5m6 7 */
pmullw (mm3, 136+ebx); /* mm3 *= mm64_coefs[17] 0 1 2 3m4 5 6 7 */
paddw (mm6, mm1); /* mm6 += mm1 0 1w2 3 4 5 6m7 */
paddw (mm6, mm2); /* mm6 += mm2 0 1 2r3 4 5 6m7 */
paddw (mm5, mm4); /* mm5 += mm4 0 1 2 3 4r5m6 7 */
psrlw (mm6, 4); /* mm6 /= 16 0 1 2 3 4 5 6m7 */
paddw (mm5, mm3); /* mm5 += mm3 0 1 2 3r4 5m6 7 */
psrlw (mm5, 4); /* mm5 /= 16 0 1 2 3 4 5m6 7 */
packuswb (mm6, mm5); /* pack result into mm6 0 1 2 3 4 5r6m7 */
movq (4+eax, mm6); /* v[] = mm6 0 1 2 3 4 5 6r7 */
#ifdef PP_SELF_CHECK
for (i=1; i<=8; i++)
{
if (selfcheck[i] != v[i+y*stride])
{
DPRINTF(_l("ERROR: MMX version of horiz lpf9 is incorrect at %d"), i);
}
}
#endif
}
}
/*
 * Horizontal deblocking filter used in default (non-DC) mode.
 *
 * Processes 4 consecutive rows. On entry v points at pixel v0 of the first
 * row; after each row v is advanced by stride. Per row the filter reads
 * v[1]..v[8] and may modify only v[4] and v[5], the two pixels that sit on
 * either side of the block edge.
 *
 * Three "frequency" terms are formed by applying the kernel [2 -5 5 -2] to
 * overlapping 4-pixel windows:
 *     a3_1 : v1..v4   (left of the edge)
 *     a3_0 : v3..v6   (straddling the edge)
 *     a3_2 : v5..v8   (right of the edge)
 * If |a3_0| is below 8*QP and exceeds min(|a3_1|, |a3_2|), a correction d is
 * derived, clipped to the range between 0 and q = (v4 - v5) / 2, and applied
 * as v4 -= d, v5 += d.
 *
 * v       pointer to pixel v0 of the first row to filter
 * stride  distance, in bytes, between successive rows
 * QP      quantiser value; scales the |a3_0| < 8*QP activation threshold
 */
static inline void deblock_horiz_default_filter(uint8_t *v, stride_t stride, int QP)
{
    int a3_0, a3_1, a3_2, d;
    int q1, q;
    int y;

    for (y = 0; y < 4; y++)
    {
        q1 = v[4] - v[5];
        q = q1 / 2;   /* clipping bound for d; truncates toward zero */
        if (q)
        {
            /* a3_0 = 2*v3 - 5*v4 + 5*v5 - 2*v6
             * (multiplication instead of '<<': the operands may be negative,
             *  and left-shifting a negative int is undefined behaviour) */
            a3_0 = 2 * (v[3] - v[6]) - 5 * q1;

            /* apply the 'delta' function first and check there is a difference to avoid wasting time */
            if (abs(a3_0) < 8 * QP)
            {
                /* a3_1 = 2*v1 - 5*v2 + 5*v3 - 2*v4 */
                a3_1 = 5 * (v[3] - v[2]) + 2 * (v[1] - v[4]);
                /* a3_2 = 2*v5 - 5*v6 + 5*v7 - 2*v8
                 * (the 5x term must be v7 - v6, mirroring a3_1; using
                 *  v7 - v8 here would drop v6 from the kernel entirely) */
                a3_2 = 5 * (v[7] - v[6]) + 2 * (v[5] - v[8]);

                d = abs(a3_0) - std::min(abs(a3_1), abs(a3_2));
                if (d > 0) /* energy across boundary is greater than in one or both of the blocks */
                {
                    /* d = (5*d + 32) / 64, rounded */
                    d = (5 * d + 32) >> 6;
                    if (d > 0)
                    {
                        /* d carries the sign of -a3_0 and is clipped to the
                         * range 0 ... q; when that sign disagrees with the
                         * sign of q the clipped value is 0 and nothing is
                         * written */
                        if (q > 0)
                        {
                            if (a3_0 < 0)
                            {
                                d = d > q ? q : d;
                                v[4] = uint8_t(v[4] - d);
                                v[5] = uint8_t(v[5] + d);
                            }
                        }
                        else
                        {
                            if (a3_0 > 0)
                            {
                                /* q is negative here: clip -d to q ... 0 */
                                d = (-d) < q ? q : (-d);
                                v[4] = uint8_t(v[4] - d);
                                v[5] = uint8_t(v[5] + d);
                            }
                        }
                    }
                }
            }
        }
#ifdef PP_SELF_CHECK
        /* no selfcheck written for this yet */
#endif
        v += stride;
    }
}
/* this is a horizontal deblocking filter - i.e. it will smooth _vertical_ block edges */
static void deblock_horiz(uint8_t *image, int width, stride_t stride, const int8_t *QP_store, int QP_stride, int chromaFlag, int DEBLOCK_HORIZ_USEDC_THR)
{
int x, y=0;
int QP;
uint8_t *v;
int useDC, DC_on;
#ifdef PREFETCH_AHEAD_H
void *prefetch_addr;
#endif
/* loop over every block boundary in that row */
for (x=8; x<width; x+=8)
{
/* extract QP from the decoder's array of QP values */
QP = chromaFlag ? QP_store[y/8*QP_stride+x/8]
: QP_store[y/16*QP_stride+x/16];
/* v points to pixel v0, in the left-hand block */
v = &(image[y*stride + x]) - 5;
#ifdef PREFETCH_AHEAD_V
/* try a prefetch PREFETCH_AHEAD_V bytes ahead on all eight rows... experimental */
prefetch_addr = v + PREFETCH_AHEAD_V;
_mm_prefetch(prefetch_addr+1*stride,_MM_HINT_T0);
_mm_prefetch(prefetch_addr+2*stride,_MM_HINT_T0);
_mm_prefetch(prefetch_addr+3*stride,_MM_HINT_T0);
_mm_prefetch(prefetch_addr+4*stride,_MM_HINT_T0);
#endif
/* first decide whether to use default or DC offset mode */
useDC = deblock_horiz_useDC(v, stride, DEBLOCK_HORIZ_USEDC_THR);
if (useDC) /* use DC offset mode */
{
DC_on = deblock_horiz_DC_on(v, stride, QP);
if (DC_on)
{
deblock_horiz_lpf9(v, stride, QP);
#ifdef SHOWDECISIONS_H
if (!chromaFlag)
{
v[0*stride + 4] =
v[1*stride + 4] =
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -