/* deblock_horiz_lpf9.c */
#include "postprocess_mmx.h"
/*
 * Rounding bias: the value 8 in each of the four 16-bit lanes.  Both MMX
 * accumulators in deblock_horiz_lpf9 start from this value before the final
 * right shift by 4.
 */
static const uint64_t mm64_0008 = 0x0008000800080008;

/* 0x01 in every byte.  Not referenced by deblock_horiz_lpf9 in this file. */
static const uint64_t mm64_0101 = 0x0101010101010101;

/*
 * Memory spill slot used by the inline assembly to keep the replicated
 * p1/p2 bytes while the register that held them is reused.
 */
static uint64_t mm64_temp;

/*
 * Per-input filter weights, four 16-bit lanes per entry.
 *
 * Entries alternate between the two output halves:
 *   "left"  entries feed the accumulator that becomes output pixels v1..v4,
 *   "right" entries feed the accumulator that becomes output pixels v5..v8.
 * Within an entry the lowest 16-bit lane is the weight for the first pixel of
 * that half (v1 or v5) and the highest lane is the weight for the last one
 * (v4 or v8).  For every output pixel the weights add up to 16.
 *
 * The assembly addresses the table by byte offset (index * 8), shown below.
 */
static const uint64_t mm64_coefs[18] = {
    0x0001000200040006, /* [ 0] offset   0: p1, left  */
    0x0000000000000001, /* [ 1] offset   8: v1, right */
    0x0001000200020004, /* [ 2] offset  16: v1, left  */
    0x0000000000010001, /* [ 3] offset  24: v2, right */
    0x0002000200040002, /* [ 4] offset  32: v2, left  */
    0x0000000100010002, /* [ 5] offset  40: v3, right */
    0x0002000400020002, /* [ 6] offset  48: v3, left  */
    0x0001000100020002, /* [ 7] offset  56: v4, right */
    0x0004000200020001, /* [ 8] offset  64: v4, left  */
    0x0001000200020004, /* [ 9] offset  72: v5, right */
    0x0002000200010001, /* [10] offset  80: v5, left  */
    0x0002000200040002, /* [11] offset  88: v6, right */
    0x0002000100010000, /* [12] offset  96: v6, left  */
    0x0002000400020002, /* [13] offset 104: v7, right */
    0x0001000100000000, /* [14] offset 112: v7, left  */
    0x0004000200020001, /* [15] offset 120: v8, right */
    0x0001000000000000, /* [16] offset 128: v8, left  */
    0x0006000400020001  /* [17] offset 136: p2, right */
};

/* p1 and p2 packed as bytes (low to high: p1, p1, p2, p2) for a single movd load. */
static uint32_t mm32_p1p2;

/* Address of the current row minus 3 bytes; loaded into eax by the inline assembly. */
static uint8_t *pmm1;
/*
 * deblock_horiz_lpf9 - smooth eight pixels in each of four consecutive rows.
 *
 * For each row y = 0..3 (row start = v + y*stride) the function:
 *   1. picks two edge values: p1 = v[0] if |v[0]-v[1]| < QP, else v[1];
 *      p2 = v[9] if |v[8]-v[9]| < QP, else v[8];
 *   2. replaces v[1]..v[8] with weighted sums of p1, v[1]..v[8] and p2, using
 *      the per-lane weights in mm64_coefs, adding 8 and shifting right by 4.
 *
 * Parameters:
 *   v      - pointer to the pixel data; per row the assembly loads the bytes
 *            v[-3]..v[12] and stores v[1]..v[8], so that whole range must be
 *            accessible.
 *   stride - distance in bytes between the starts of consecutive rows.
 *   QP     - threshold for the edge tests above (presumably the quantiser
 *            parameter - confirm against the caller).
 *
 * Notes for maintainers:
 *   - ABS and INLINE are not defined in this file (presumably they come from
 *     postprocess_mmx.h - confirm).
 *   - The assembly takes its operands from the file-scope statics mm32_p1p2,
 *     pmm1 and mm64_temp, which are rewritten on every row; simultaneous
 *     calls would share them.
 *   - No emms instruction is executed here. NOTE(review): presumably the
 *     caller clears the MMX state - confirm.
 *   - With PP_SELF_CHECK defined, a scalar reference result is computed before
 *     the assembly runs and compared with the filtered pixels afterwards.
 *
 * In the register comments below, byte and word lists run from the least
 * significant lane to the most significant lane, and v1..v8 stand for
 * v[1]..v[8] of the current row.
 */
INLINE void deblock_horiz_lpf9(uint8_t *v, int stride, int QP) {
int y, p1, p2;
#ifdef PP_SELF_CHECK
/* selfcheck[1..8] hold the scalar reference outputs; element 0 is unused. */
uint8_t selfcheck[9];
int psum;
uint8_t *vv;
int i;
#endif
for (y=0; y<4; y++) {
/* Edge values: use the outer pixel only when it is within QP of its inner neighbour. */
p1 = (ABS(v[0+y*stride]-v[1+y*stride]) < QP ) ? v[0+y*stride] : v[1+y*stride];
p2 = (ABS(v[8+y*stride]-v[9+y*stride]) < QP ) ? v[9+y*stride] : v[8+y*stride];
/* Pack as bytes p1,p1,p2,p2: (p2<<16)+p1 is 0x00p200p1, and *0x0101 copies each byte one position up. */
mm32_p1p2 = 0x0101 * ((p2 << 16) + p1);
#ifdef PP_SELF_CHECK
/* Scalar reference, computed from the still-unfiltered row using a running sum. */
vv = &(v[y*stride]);
psum = p1 + p1 + p1 + vv[1] + vv[2] + vv[3] + vv[4] + 4;
selfcheck[1] = (((psum + vv[1]) << 1) - (vv[4] - vv[5])) >> 4;
psum += vv[5] - p1;
selfcheck[2] = (((psum + vv[2]) << 1) - (vv[5] - vv[6])) >> 4;
psum += vv[6] - p1;
selfcheck[3] = (((psum + vv[3]) << 1) - (vv[6] - vv[7])) >> 4;
psum += vv[7] - p1;
selfcheck[4] = (((psum + vv[4]) << 1) + p1 - vv[1] - (vv[7] - vv[8])) >> 4;
psum += vv[8] - vv[1];
selfcheck[5] = (((psum + vv[5]) << 1) + (vv[1] - vv[2]) - vv[8] + p2) >> 4;
psum += p2 - vv[2];
selfcheck[6] = (((psum + vv[6]) << 1) + (vv[2] - vv[3])) >> 4;
psum += p2 - vv[3];
selfcheck[7] = (((psum + vv[7]) << 1) + (vv[3] - vv[4])) >> 4;
psum += p2 - vv[4];
selfcheck[8] = (((psum + vv[8]) << 1) + (vv[4] - vv[5])) >> 4;
#endif
/* eax will point 3 bytes before the row start, so [eax] covers v[-3]..v[4], 8[eax] covers v[5]..v[12] and 4[eax] covers v[1]..v[8]. */
pmm1 = (&(v[y*stride-3])); __asm {
push eax
push ebx
mov eax, pmm1 /* eax = &v[y*stride-3] */
lea ebx, mm64_coefs /* ebx = base of the weight table */
#ifdef PREFETCH_ENABLE
prefetcht0 32[ebx]
#endif
movd mm0, mm32_p1p2 /* mm0 bytes = p1 p1 p2 p2 0 0 0 0 */
punpcklbw mm0, mm0 /* mm0 bytes = p1 p1 p1 p1 p2 p2 p2 p2 */
movq mm2, qword ptr [eax] /* mm2 bytes = v[-3] v[-2] v[-1] v[0] v1 v2 v3 v4 */
pxor mm7, mm7 /* mm7 = 0, used to zero-extend bytes to words */
movq mm6, mm64_0008 /* mm6 = "left" accumulator (outputs v1..v4), seeded with 8 per lane */
punpckhbw mm2, mm2 /* mm2 bytes = v1 v1 v2 v2 v3 v3 v4 v4 */
movq mm64_temp, mm0 /* save p1/p2 bytes; p2 is needed again at the end */
punpcklbw mm0, mm7 /* mm0 words = p1 p1 p1 p1 */
movq mm5, mm6 /* mm5 = "right" accumulator (outputs v5..v8), seeded with 8 per lane */
pmullw mm0, [ebx] /* mm0 = p1 * (p1 left weights) */
movq mm1, mm2
punpcklbw mm2, mm2 /* mm2 bytes = v1 v1 v1 v1 v2 v2 v2 v2 */
punpckhbw mm1, mm1 /* mm1 bytes = v3 v3 v3 v3 v4 v4 v4 v4 */
#ifdef PREFETCH_ENABLE
/* NOTE(review): same address as the prefetch above - possibly redundant. */
prefetcht0 32[ebx]
#endif
movq mm3, mm2
punpcklbw mm2, mm7 /* mm2 words = v1 v1 v1 v1 */
punpckhbw mm3, mm7 /* mm3 words = v2 v2 v2 v2 */
paddw mm6, mm0 /* left += p1 term */
movq mm0, mm2
pmullw mm0, 8[ebx] /* mm0 = v1 * (v1 right weights) */
movq mm4, mm3
pmullw mm2, 16[ebx] /* mm2 = v1 * (v1 left weights) */
pmullw mm3, 32[ebx] /* mm3 = v2 * (v2 left weights) */
pmullw mm4, 24[ebx] /* mm4 = v2 * (v2 right weights) */
paddw mm5, mm0 /* right += v1 term */
paddw mm6, mm2 /* left += v1 term */
movq mm2, mm1
punpckhbw mm2, mm7 /* mm2 words = v4 v4 v4 v4 */
paddw mm5, mm4 /* right += v2 term */
punpcklbw mm1, mm7 /* mm1 words = v3 v3 v3 v3 */
paddw mm6, mm3 /* left += v2 term */
#ifdef PREFETCH_ENABLE
prefetcht0 64[ebx]
#endif
movq mm0, mm1
pmullw mm1, 48[ebx] /* mm1 = v3 * (v3 left weights) */
pmullw mm0, 40[ebx] /* mm0 = v3 * (v3 right weights) */
movq mm4, mm2
pmullw mm2, 64[ebx] /* mm2 = v4 * (v4 left weights) */
paddw mm6, mm1 /* left += v3 term */
pmullw mm4, 56[ebx] /* mm4 = v4 * (v4 right weights) */
pxor mm3, mm3 /* mm3 = 0; it is reloaded from mm2 below before being read */
movq mm1, 8[eax] /* mm1 bytes = v5 v6 v7 v8 v[9] v[10] v[11] v[12] */
paddw mm5, mm0 /* right += v3 term */
punpcklbw mm1, mm1 /* mm1 bytes = v5 v5 v6 v6 v7 v7 v8 v8 */
paddw mm6, mm2 /* left += v4 term */
#ifdef PREFETCH_ENABLE
prefetcht0 96[ebx]
#endif
movq mm2, mm1
paddw mm5, mm4 /* right += v4 term */
punpcklbw mm2, mm2 /* mm2 bytes = v5 v5 v5 v5 v6 v6 v6 v6 */
punpckhbw mm1, mm1 /* mm1 bytes = v7 v7 v7 v7 v8 v8 v8 v8 */
movq mm3, mm2
punpcklbw mm2, mm7 /* mm2 words = v5 v5 v5 v5 */
punpckhbw mm3, mm7 /* mm3 words = v6 v6 v6 v6 */
movq mm0, mm2
pmullw mm0, 72[ebx] /* mm0 = v5 * (v5 right weights) */
movq mm4, mm3
pmullw mm2, 80[ebx] /* mm2 = v5 * (v5 left weights) */
pmullw mm3, 96[ebx] /* mm3 = v6 * (v6 left weights) */
pmullw mm4, 88[ebx] /* mm4 = v6 * (v6 right weights) */
paddw mm5, mm0 /* right += v5 term */
paddw mm6, mm2 /* left += v5 term */
movq mm2, mm1
paddw mm6, mm3 /* left += v6 term */
punpcklbw mm1, mm7 /* mm1 words = v7 v7 v7 v7 */
paddw mm5, mm4 /* right += v6 term */
punpckhbw mm2, mm7 /* mm2 words = v8 v8 v8 v8 */
#ifdef PREFETCH_ENABLE
prefetcht0 128[ebx]
#endif
movq mm3, mm64_temp /* mm3 bytes = p1 p1 p1 p1 p2 p2 p2 p2 */
movq mm0, mm1
pmullw mm0, 104[ebx] /* mm0 = v7 * (v7 right weights) */
movq mm4, mm2
pmullw mm1, 112[ebx] /* mm1 = v7 * (v7 left weights) */
punpckhbw mm3, mm7 /* mm3 words = p2 p2 p2 p2 */
pmullw mm2, 128[ebx] /* mm2 = v8 * (v8 left weights) */
pmullw mm4, 120[ebx] /* mm4 = v8 * (v8 right weights) */
paddw mm5, mm0 /* right += v7 term */
pmullw mm3, 136[ebx] /* mm3 = p2 * (p2 right weights) */
paddw mm6, mm1 /* left += v7 term */
paddw mm6, mm2 /* left += v8 term */
paddw mm5, mm4 /* right += v8 term */
psrlw mm6, 4 /* left = (weighted sum + 8) >> 4 */
paddw mm5, mm3 /* right += p2 term */
psrlw mm5, 4 /* right = (weighted sum + 8) >> 4 */
packuswb mm6, mm5 /* mm6 bytes = new v1..v4 (from mm6), new v5..v8 (from mm5) */
movq 4[eax], mm6 /* store the eight results to v[1]..v[8] of this row */
pop ebx
pop eax
};
#ifdef PP_SELF_CHECK
/* Compare the filtered pixels with the scalar reference computed above. */
for (i=1; i<=8; i++) {
if (selfcheck[i] != v[i+y*stride]) {
printf("ERROR: MMX version of horiz lpf9 is incorrect at %d\n", i);
}
}
#endif
}
}
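/*
 * Code-viewer page residue (not part of the source file): keyboard shortcuts.
 *   Copy code:               Ctrl + C
 *   Search code:             Ctrl + F
 *   Full-screen mode:        F11
 *   Switch theme:            Ctrl + Shift + D
 *   Show keyboard shortcuts: ?
 *   Increase font size:      Ctrl + =
 *   Decrease font size:      Ctrl + -
 */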