📄 postprocess_template.c
字号:
"movq %%mm3, (%0) \n\t" // X
// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
"movq %%mm1, %%mm0 \n\t" // 1
PAVGB(%%mm6, %%mm0) //1 1 /2
"movq %%mm4, %%mm3 \n\t" // 1
PAVGB((%0,%1,2), %%mm3) // 1 1 /2
PAVGB((%%REGa,%1,2), %%mm5) // 11 /2
PAVGB((%%REGa), %%mm5) // 211 /4
PAVGB(%%mm5, %%mm3) // 2 2211 /8
PAVGB(%%mm0, %%mm3) //4242211 /16
"movq %%mm3, (%0,%1) \n\t" // X
// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
PAVGB(%%mm4, %%mm6) //11 /2
"movq (%%"REG_c"), %%mm0 \n\t" // 1
PAVGB((%%REGa, %1, 2), %%mm0) // 11/2
"movq %%mm0, %%mm3 \n\t" // 11/2
PAVGB(%%mm1, %%mm0) // 2 11/4
PAVGB(%%mm6, %%mm0) //222 11/8
PAVGB(%%mm2, %%mm0) //22242211/16
"movq (%0, %1, 2), %%mm2 \n\t" // 1
"movq %%mm0, (%0, %1, 2) \n\t" // X
// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
PAVGB((%%REGc), %%mm0) // 11 /2
PAVGB(%%mm0, %%mm6) //11 11 /4
PAVGB(%%mm1, %%mm4) // 11 /2
PAVGB(%%mm2, %%mm1) // 11 /2
PAVGB(%%mm1, %%mm6) //1122 11 /8
PAVGB(%%mm5, %%mm6) //112242211 /16
"movq (%%"REG_a"), %%mm5 \n\t" // 1
"movq %%mm6, (%%"REG_a") \n\t" // X
// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
"movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1
PAVGB(%%mm7, %%mm6) // 11 /2
PAVGB(%%mm4, %%mm6) // 11 11 /4
PAVGB(%%mm3, %%mm6) // 11 2211 /8
PAVGB(%%mm5, %%mm2) // 11 /2
"movq (%0, %1, 4), %%mm4 \n\t" // 1
PAVGB(%%mm4, %%mm2) // 112 /4
PAVGB(%%mm2, %%mm6) // 112242211 /16
"movq %%mm6, (%0, %1, 4) \n\t" // X
// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
PAVGB(%%mm7, %%mm1) // 11 2 /4
PAVGB(%%mm4, %%mm5) // 11 /2
PAVGB(%%mm5, %%mm0) // 11 11 /4
"movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1
PAVGB(%%mm6, %%mm1) // 11 4 2 /8
PAVGB(%%mm0, %%mm1) // 11224222 /16
"movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X
// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
PAVGB((%%REGc), %%mm2) // 112 4 /8
"movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1
PAVGB(%%mm0, %%mm6) // 1 1 /2
PAVGB(%%mm7, %%mm6) // 1 12 /4
PAVGB(%%mm2, %%mm6) // 1122424 /4
"movq %%mm6, (%%"REG_c") \n\t" // X
// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
PAVGB(%%mm7, %%mm5) // 11 2 /4
PAVGB(%%mm7, %%mm5) // 11 6 /8
PAVGB(%%mm3, %%mm0) // 112 /4
PAVGB(%%mm0, %%mm5) // 112246 /16
"movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X
"sub %1, %0 \n\t"
:
: "r" (src), "r" ((long)stride), "m" (c->pQPb)
: "%"REG_a, "%"REG_c
);
#else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= stride + l3;
const int l5= stride + l4;
const int l6= stride + l5;
const int l7= stride + l6;
const int l8= stride + l7;
const int l9= stride + l8;
int x;
src+= stride*3;
for(x=0; x<BLOCK_SIZE; x++)
{
const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
int sums[10];
sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
sums[1] = sums[0] - first + src[l4];
sums[2] = sums[1] - first + src[l5];
sums[3] = sums[2] - first + src[l6];
sums[4] = sums[3] - first + src[l7];
sums[5] = sums[4] - src[l1] + src[l8];
sums[6] = sums[5] - src[l2] + last;
sums[7] = sums[6] - src[l3] + last;
sums[8] = sums[7] - src[l4] + last;
sums[9] = sums[8] - src[l5] + last;
src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
src++;
}
#endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
}
#endif //HAVE_ALTIVEC
#if 0
/**
* Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
* values are correctly clipped (MMX2)
* values are wraparound (C)
* Conclusion: It is fast, but introduces ugly horizontal patterns
* if there is a continuous gradient.
0 8 16 24
x = 8
x/2 = 4
x/8 = 1
1 12 12 23
*/
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
src+= stride*3;
// FIXME rounding
asm volatile(
"pxor %%mm7, %%mm7 \n\t" // 0
"movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
"leal (%0, %1), %%"REG_a" \n\t"
"leal (%%"REG_a", %1, 4), %%"REG_c" \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
"movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
"movq %%mm0, %%mm1 \n\t" // QP,..., QP
"paddusb "MANGLE(b02)", %%mm0 \n\t"
"psrlw $2, %%mm0 \n\t"
"pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
"movq (%0, %1, 4), %%mm2 \n\t" // line 4
"movq (%%"REG_c"), %%mm3 \n\t" // line 5
"movq %%mm2, %%mm4 \n\t" // line 4
"pcmpeqb %%mm5, %%mm5 \n\t" // -1
"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
PAVGB(%%mm3, %%mm5)
"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
"psubusb %%mm3, %%mm4 \n\t"
"psubusb %%mm2, %%mm3 \n\t"
"por %%mm3, %%mm4 \n\t" // |l4 - l5|
"psubusb %%mm0, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm4 \n\t"
"pand %%mm4, %%mm5 \n\t" // d/2
// "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
"paddb %%mm5, %%mm2 \n\t"
// "psubb %%mm6, %%mm2 \n\t"
"movq %%mm2, (%0,%1, 4) \n\t"
"movq (%%"REG_c"), %%mm2 \n\t"
// "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
"psubb %%mm5, %%mm2 \n\t"
// "psubb %%mm6, %%mm2 \n\t"
"movq %%mm2, (%%"REG_c") \n\t"
"paddb %%mm6, %%mm5 \n\t"
"psrlw $2, %%mm5 \n\t"
"pand "MANGLE(b3F)", %%mm5 \n\t"
"psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8
"movq (%%"REG_a", %1, 2), %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
"paddsb %%mm5, %%mm2 \n\t"
"psubb %%mm6, %%mm2 \n\t"
"movq %%mm2, (%%"REG_a", %1, 2) \n\t"
"movq (%%"REG_c", %1), %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
"psubsb %%mm5, %%mm2 \n\t"
"psubb %%mm6, %%mm2 \n\t"
"movq %%mm2, (%%"REG_c", %1) \n\t"
:
: "r" (src), "r" ((long)stride)
: "%"REG_a, "%"REG_c
);
#else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
const int l1= stride;
const int l2= stride + l1;
const int l3= stride + l2;
const int l4= stride + l3;
const int l5= stride + l4;
const int l6= stride + l5;
// const int l7= stride + l6;
// const int l8= stride + l7;
// const int l9= stride + l8;
int x;
const int QP15= QP + (QP>>2);
src+= stride*3;
for(x=0; x<BLOCK_SIZE; x++)
{
const int v = (src[x+l5] - src[x+l4]);
if(FFABS(v) < QP15)
{
src[x+l3] +=v>>3;
src[x+l4] +=v>>1;
src[x+l5] -=v>>1;
src[x+l6] -=v>>3;
}
}
#endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
}
#endif //0
/**
* Experimental Filter 1
* will not damage linear gradients
* Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
* can only smooth blocks at the expected locations (it cannot smooth them if they did move)
* MMX2 version does correct clipping C version does not
*/
static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
src+= stride*3;
asm volatile(
"pxor %%mm7, %%mm7 \n\t" // 0
"lea (%0, %1), %%"REG_a" \n\t"
"lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
"movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3
"movq (%0, %1, 4), %%mm1 \n\t" // line 4
"movq %%mm1, %%mm2 \n\t" // line 4
"psubusb %%mm0, %%mm1 \n\t"
"psubusb %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm0 \n\t" // |l2 - l3|
"movq (%%"REG_c"), %%mm3 \n\t" // line 5
"movq (%%"REG_c", %1), %%mm4 \n\t" // line 6
"movq %%mm3, %%mm5 \n\t" // line 5
"psubusb %%mm4, %%mm3 \n\t"
"psubusb %%mm5, %%mm4 \n\t"
"por %%mm4, %%mm3 \n\t" // |l5 - l6|
PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2
"movq %%mm2, %%mm1 \n\t" // line 4
"psubusb %%mm5, %%mm2 \n\t"
"movq %%mm2, %%mm4 \n\t"
"pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
"psubusb %%mm1, %%mm5 \n\t"
"por %%mm5, %%mm4 \n\t" // |l4 - l5|
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -