📄 nic_postprocess.cpp
字号:
__m64 mm0,mm1,mm2,mm3;
for (;i<0;i++)
{
movq (mm0, eax); /* mm0 = v[0*stride] */
#ifdef PREFETCH_ENABLE
prefetcht0 0[ebx]
#endif
eax+= stride; /* p_data += stride */
movq (mm1, mm0); /* mm1 = v[0*stride] */
punpcklbw (mm0, mm7); /* unpack low bytes (left hand 4) */
movq (mm2, eax); /* mm2 = v[0*stride] */
punpckhbw (mm1, mm7); /* unpack high bytes (right hand 4)*/
movq (mm3, mm2); /* mm3 = v[0*stride] */
punpcklbw (mm2, mm7); /* unpack low bytes (left hand 4) */
movq (ebx, mm0); /* v_local[n] = mm0 (left) */
eax+= stride; /* p_data += stride */
movq (8+ebx, mm1); /* v_local[n+8] = mm1 (right) */
punpckhbw (mm3, mm7); /* unpack high bytes (right hand 4)*/
movq (16+ebx, mm2); /* v_local[n+16] = mm2 (left) */
movq (24+ebx, mm3); /* v_local[n+24] = mm3 (right) */
ebx+= 32; /* p_data2 += 8 */
}
#ifdef PP_SELF_CHECK
/* check that MMX copy has worked correctly */
for (k=0; k<n; k++)
{
for (j=0; j<8; j++)
{
if ( ((uint16_t *)dest)[k*8+j] != source[k*stride+j] )
{
DPRINTF(_l("ERROR: MMX copy block is flawed at (%d, %d)"), j, k);
}
}
}
#endif
}
/*
 * Helper for deblock_vert_choose_p1p2(): picks one 8-pixel "endstop" row and
 * stores it widened to 16 bits per pixel.
 *
 *   outer   - 8 bytes of the row lying outside the filtered rows (row 0 or 9)
 *   inner   - 8 bytes of the neighbouring row (row 1 or row 8)
 *   mm_b_qp - QP replicated into each of the 8 byte lanes
 *   dst     - receives 16 bytes: pixels 0..3 as four 16-bit words at dst,
 *             pixels 4..7 as four 16-bit words at dst+8
 *
 * Per pixel the result is:  (|outer - inner| <= QP) ? outer : inner
 */
static inline void deblock_vert_pick_endstop(unsigned char *outer, unsigned char *inner,
                                             __m64 mm_b_qp, unsigned char *dst)
{
    /* Built with _mm_set1_pi32 rather than pxor(x, x) so that no
       uninitialised register variable is ever read. */
    __m64 mm_zero = _mm_set1_pi32(0);
    __m64 mm_outer, mm_inner, mm_sel, mm_tmp, mm_high;

    movq (mm_outer, outer);
    movq (mm_inner, inner);

    /* |outer - inner| per byte: OR of the two saturating differences */
    movq (mm_sel, mm_outer);
    psubusb (mm_sel, mm_inner);
    movq (mm_tmp, mm_inner);
    psubusb (mm_tmp, mm_outer);
    por (mm_sel, mm_tmp);

    /* saturating subtract of QP leaves a zero byte exactly where |diff| <= QP */
    psubusb (mm_sel, mm_b_qp);
    /* zero bytes -> ff (take outer), all others -> 00 (take inner) */
    pcmpeqb (mm_sel, mm_zero);

    /* blend: (~mask & inner) | (mask & outer) */
    movq (mm_tmp, mm_sel);
    pandn (mm_sel, mm_inner);
    pand (mm_tmp, mm_outer);
    por (mm_sel, mm_tmp);

    /* widen bytes to words by interleaving with zero, then store */
    movq (mm_high, mm_sel);
    punpcklbw (mm_sel, mm_zero);  /* pixels 0..3 */
    punpckhbw (mm_high, mm_zero); /* pixels 4..7 */
    movq (dst, mm_sel);
    movq (8+dst, mm_high);
}

/*
 * Chooses the "endstops" p1 and p2 for the vertical LPF9 filter and converts
 * them to 16-bit values.
 *
 *   v      - pointer to row 0 of the pixel column block; rows 0, 1, 8 and 9
 *            are read, 8 bytes each
 *   stride - distance in bytes between consecutive rows of v
 *   p1p2   - output, four 64-bit words:
 *              p1p2[0], p1p2[1] = p1 (left 4 pixels, right 4 pixels)
 *              p1p2[2], p1p2[3] = p2 (left 4 pixels, right 4 pixels)
 *   QP     - quantiser threshold; only its low byte takes part in the
 *            per-byte comparison when 0 <= QP <= 255
 *
 *   p1 = (|v[0] - v[1]| <= QP) ? v[0] : v[1]
 *   p2 = (|v[9] - v[8]| <= QP) ? v[9] : v[8]
 */
static inline void deblock_vert_choose_p1p2(uint8_t *v, stride_t stride, uint64_t *p1p2, int QP)
{
    /* Replicate QP into every byte lane. The multiply is done in unsigned
       arithmetic so that QP >= 128 cannot trigger signed overflow; the bit
       pattern handed to _mm_set1_pi32 is unchanged. */
    __m64 mm_b_qp = _mm_set1_pi32((int)(0x01010101u * (unsigned int)QP));
    unsigned char *rows = (unsigned char *)v;
    unsigned char *out = (unsigned char *)p1p2;
#ifdef PP_SELF_CHECK
    int i;
#endif

    /* p1: row 0 preferred, row 1 as fallback -> p1p2[0..1] */
    deblock_vert_pick_endstop(rows, rows + stride, mm_b_qp, out);

    /* p2: row 9 preferred, row 8 as fallback -> p1p2[2..3] */
    deblock_vert_pick_endstop(rows + 9 * stride, rows + 8 * stride, mm_b_qp, 16 + out);

#ifdef PP_SELF_CHECK
    /* check p1 and p2 have been calculated correctly */
    /* p2 */
    for (i=0; i<8; i++)
    {
        if ( ((abs(v[9*stride+i] - v[8*stride+i]) - QP > 0) ? v[8*stride+i] : v[9*stride+i])
             != ((uint16_t *)(&(p1p2[2])))[i] )
        {
            DPRINTF(_l("ERROR: problem with P2"));
        }
    }
    /* p1 */
    for (i=0; i<8; i++)
    {
        if ( ((abs(v[0*stride+i] - v[1*stride+i]) - QP > 0) ? v[1*stride+i] : v[0*stride+i])
             != ((uint16_t *)(&(p1p2[0])))[i] )
        {
            DPRINTF(_l("ERROR: problem with P1"));
        }
    }
#endif
}
/* Vertical 9-tap low-pass filter for use in "DC" regions of the picture */
static inline void deblock_vert_lpf9(uint64_t *v_local, uint64_t *p1p2, uint8_t *v, stride_t stride)
{
__m64 mm_fours = _mm_set1_pi16(0x0004);
#ifdef PP_SELF_CHECK
int j, k;
uint8_t selfcheck[64], *vv;
int p1, p2, psum;
/* define semi-constants to enable us to move up and down the picture easily... */
int l1 = 1 * stride;
int l2 = 2 * stride;
int l3 = 3 * stride;
int l4 = 4 * stride;
int l5 = 5 * stride;
int l6 = 6 * stride;
int l7 = 7 * stride;
int l8 = 8 * stride;
#endif
#ifdef PP_SELF_CHECK
/* generate a self-check version of the filter result in selfcheck[64] */
/* loop left->right */
for (j=0; j<8; j++)
{
vv = &(v[j]);
p1 = ((uint16_t *)(&(p1p2[0+j/4])))[j%4]; /* yuck! */
p2 = ((uint16_t *)(&(p1p2[2+j/4])))[j%4]; /* yuck! */
/* the above may well be endian-fussy */
psum = p1 + p1 + p1 + vv[l1] + vv[l2] + vv[l3] + vv[l4] + 4;
selfcheck[j+8*0] = (((psum + vv[l1]) << 1) - (vv[l4] - vv[l5])) >> 4;
psum += vv[l5] - p1;
selfcheck[j+8*1] = (((psum + vv[l2]) << 1) - (vv[l5] - vv[l6])) >> 4;
psum += vv[l6] - p1;
selfcheck[j+8*2] = (((psum + vv[l3]) << 1) - (vv[l6] - vv[l7])) >> 4;
psum += vv[l7] - p1;
selfcheck[j+8*3] = (((psum + vv[l4]) << 1) + p1 - vv[l1] - (vv[l7] - vv[l8])) >> 4;
psum += vv[l8] - vv[l1];
selfcheck[j+8*4] = (((psum + vv[l5]) << 1) + (vv[l1] - vv[l2]) - vv[l8] + p2) >> 4;
psum += p2 - vv[l2];
selfcheck[j+8*5] = (((psum + vv[l6]) << 1) + (vv[l2] - vv[l3])) >> 4;
psum += p2 - vv[l3];
selfcheck[j+8*6] = (((psum + vv[l7]) << 1) + (vv[l3] - vv[l4])) >> 4;
psum += p2 - vv[l4];
selfcheck[j+8*7] = (((psum + vv[l8]) << 1) + (vv[l4] - vv[l5])) >> 4;
}
#endif
/* vertical DC filter in MMX
mm2 - p1/2 left
mm3 - p1/2 right
mm4 - psum left
mm5 - psum right */
/* alternate between using mm0/mm1 and mm6/mm7 to accumulate left/right */
//push eax
//push ebx
//push ecx
unsigned char *eax=(unsigned char*)p1p2;
unsigned char *ebx=(unsigned char*)v_local;
unsigned char *ecx=(unsigned char*)v;
/* load p1 left into mm2 and p1 right into mm3 */
__m64 mm2;
movq ( mm2, eax); /* mm2 = p1p2[0] 0 1 2w3 4 5 6 7 */
ecx+= stride;// /* ecx points at v[1*stride] 0 1 2 3 4 5 6 7 */
__m64 mm3;
movq (mm3, 8+eax); /* mm3 = p1p2[1] 0 1 2 3w4 5 6 7 */
__m64 mm4;
movq (mm4, mm_fours); /* mm4 = 0x0004000400040004 0 1 2 3 4w5 6 7 */
/* psum = p1 + p1 + p1 + vv[1] + vv[2] + vv[3] + vv[4] + 4 */
/* psum left will be in mm4, right in mm5 */
__m64 mm5;
movq (mm5, mm4); /* mm5 = 0x0004000400040004 0 1 2 3 4 5w6 7 */
paddsw (mm4, 16+ebx); /* mm4 += vv[1] left 0 1 2 3 4m5 6 7 */
paddw (mm5, mm3); /* mm5 += p2 left 0 1 2 3r4 5m6 7 */
paddsw (mm4, 32+ebx); /* mm4 += vv[2] left 0 1 2 3 4m5 6 7 */
paddw (mm5, mm3); /* mm5 += p2 left 0 1 2 3r4 5m6 7 */
paddsw (mm4, 48+ebx); /* mm4 += vv[3] left 0 1 2 3 4m5 6 7 */
paddw (mm5, mm3); /* mm5 += p2 left 0 1 2 3r4 5m6 7 */
paddsw (mm5, 24+ebx); /* mm5 += vv[1] right 0 1 2 3 4 5m6 7 */
paddw (mm4, mm2); /* mm4 += p1 left 0 1 2r3 4m5 6 7 */
paddsw (mm5, 40+ebx); /* mm5 += vv[2] right 0 1 2 3 4 5m6 7 */
paddw (mm4, mm2); /* mm4 += p1 left 0 1 2r3 4m5 6 7 */
paddsw (mm5, 56+ebx); /* mm5 += vv[3] right 0 1 2 3 4 5m6 7 */
paddw (mm4, mm2); /* mm4 += p1 left 0 1 2r3 4m5 6 7 */
paddsw (mm4, 64+ebx); /* mm4 += vv[4] left 0 1 2 3 4m5 6 7 */
paddsw (mm5, 72+ebx); /* mm5 += vv[4] right 0 1 2 3 4 5m6 7 */
/* v[1] = (((psum + vv[1]) << 1) - (vv[4] - vv[5])) >> 4 */
/* compute this in mm0 (left) and mm1 (right) */
__m64 mm0,mm1;
movq (mm0, mm4); /* mm0 = psum left 0w1 2 3 4 5 6 7 */
paddsw (mm0, 16+ebx); /* mm0 += vv[1] left 0m1 2 3 4 5 6 7 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -