📄 postprocess_template.c
字号:
"por %%mm3, %%mm1 \n\t" // L00 "por %%mm4, %%mm2 \n\t" // L20 "movq %%mm1, %%mm3 \n\t" // L00 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 "psubusb %%mm7, %%mm0 \n\t" "psubusb %%mm7, %%mm2 \n\t" "psubusb %%mm7, %%mm3 \n\t" "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 "paddb %%mm2, %%mm0 \n\t" "paddb %%mm3, %%mm0 \n\t" "movq (%%eax), %%mm2 \n\t" // L11 "movq %%mm2, %%mm3 \n\t" // L11 "movq %%mm2, %%mm4 \n\t" // L11 "psllq $8, %%mm3 \n\t" "psrlq $8, %%mm4 \n\t" "movd -4(%%eax), %%mm5 \n\t" "movd 8(%%eax), %%mm6 \n\t" "psrlq $24, %%mm5 \n\t" "psllq $56, %%mm6 \n\t" "por %%mm5, %%mm3 \n\t" // L01 "por %%mm6, %%mm4 \n\t" // L21 "movq %%mm3, %%mm5 \n\t" // L01 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 "psubusb %%mm7, %%mm2 \n\t" "psubusb %%mm7, %%mm4 \n\t" "psubusb %%mm7, %%mm5 \n\t" "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 "paddb %%mm4, %%mm2 \n\t" "paddb %%mm5, %%mm2 \n\t"// 0, 2, 3, 1#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ "movq " #src ", " #sx " \n\t" /* src[0] */\ "movq " #sx ", " #lx " \n\t" /* src[0] */\ "movq " #sx ", " #t0 " \n\t" /* src[0] */\ "psllq $8, " #lx " \n\t"\ "psrlq $8, " #t0 " \n\t"\ "movd -4" #src ", " #t1 " \n\t"\ "psrlq $24, " #t1 " \n\t"\ "por " #t1 ", " #lx " \n\t" /* src[-1] */\ "movd 8" #src ", " #t1 " \n\t"\ "psllq $56, " #t1 " \n\t"\ "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ PAVGB(lx, pplx) \ "movq " #lx ", 8(%%ecx) \n\t"\ "movq (%%ecx), " #lx " \n\t"\ "psubusb " #lx ", " #t1 " \n\t"\ "psubusb " #lx ", " #t0 " \n\t"\ "psubusb " #lx ", " #sx " \n\t"\ "movq "MANGLE(b00)", " #lx " \n\t"\ "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ "paddb " #t1 ", " #t0 " \n\t"\ "paddb " #t0 ", " #sx " \n\t"\\ PAVGB(plx, pplx) /* filtered */\ "movq " #dst ", " #t0 " \n\t" /* dst */\ "movq " #t0 ", " #t1 " \n\t" /* dst */\ "psubusb %3, " #t0 " \n\t"\ "paddusb %3, " #t1 " \n\t"\ PMAXUB(t0, pplx)\ PMINUB(t1, pplx, t0)\ "paddb " #sx ", " #ppsx " \n\t"\ "paddb " #psx ", " #ppsx " \n\t"\ "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ "pand "MANGLE(b08)", " #ppsx " \n\t"\ "pcmpeqb " #lx ", " #ppsx " \n\t"\ "pand " #ppsx ", " #pplx " \n\t"\ "pandn " #dst ", " #ppsx " \n\t"\ "por " #pplx ", " #ppsx " \n\t"\ "movq " #ppsx ", " #dst " \n\t"\ "movq 8(%%ecx), " #lx " \n\t"/*0000000111111111111101111101111110011110111111010111100111110001110111*///DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) "1: \n\t" : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) : "%eax", "%edx", "%ecx" );#else int y; int min=255; int max=0; int avg; uint8_t *p; int s[10]; const int QP2= c->QP/2 + 1; for(y=1; y<9; y++) { int x; p= src + stride*y; for(x=1; x<9; x++) { p++; if(*p > max) max= *p; if(*p < min) min= *p; } } avg= (min + max + 1)>>1; if(max - min <deringThreshold) return; for(y=0; y<10; y++) { int t = 0; if(src[stride*y + 0] > avg) t+= 1; if(src[stride*y + 1] > avg) t+= 2; if(src[stride*y + 2] > avg) t+= 4; if(src[stride*y + 3] > avg) t+= 8; if(src[stride*y + 4] > avg) t+= 16; if(src[stride*y + 5] > avg) t+= 32; if(src[stride*y + 6] > avg) t+= 64; if(src[stride*y + 7] > avg) t+= 128; if(src[stride*y + 8] > avg) t+= 256; if(src[stride*y + 9] > avg) t+= 512; t |= (~t)<<16; t &= (t<<1) & (t>>1); s[y] = t; } for(y=1; y<9; y++) { int t = s[y-1] & s[y] & s[y+1]; t|= t>>16; s[y-1]= t; } for(y=1; y<9; y++) { int x; int t = s[y-1]; p= src + stride*y; for(x=1; x<9; x++) { p++; if(t & (1<<x)) { int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); f= (f + 8)>>4;#ifdef DEBUG_DERING_THRESHOLD asm volatile("emms\n\t":); { static long long numPixels=0; if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;// if((max-min)<20 || (max-min)*QP<200)// if((max-min)*QP < 500)// if(max-min<QP/2) if(max-min < 20) { static int numSkiped=0; static int errorSum=0; static int worstQP=0; static int worstRange=0; static int worstDiff=0; int diff= (f - *p); int absDiff= ABS(diff); int error= diff*diff; if(x==1 || x==8 || y==1 || y==8) continue; numSkiped++; if(absDiff > worstDiff) { worstDiff= absDiff; worstQP= QP; worstRange= max-min; } errorSum+= error; if(1024LL*1024LL*1024LL % numSkiped == 0) { printf( "sum:%1.3f, skip:%d, wQP:%d, " "wRange:%d, wDiff:%d, relSkip:%1.3f\n", (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, worstDiff, (float)numSkiped/numPixels); } } }#endif if (*p + QP2 < f) *p= *p + QP2; else if(*p - QP2 > f) *p= *p - QP2; else *p=f; } } }#ifdef DEBUG_DERING_THRESHOLD if(max-min < 20) { for(y=1; y<9; y++) { int x; int t = 0; p= src + stride*y; for(x=1; x<9; x++) { p++; *p = MIN(*p + 20, 255); } }// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; }#endif#endif}/** * Deinterlaces the given block by linearly interpolating every second line. * will be called for every 8x8 block and can read & write from line 4-15 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too * lines 4-12 will be read into the deblocking filter and should be deinterlaced */static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride){#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= 4*stride; asm volatile( "leal (%0, %1), %%eax \n\t" "leal (%%eax, %1, 4), %%ecx \n\t"// 0 1 2 3 4 5 6 7 8 9// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 "movq (%0), %%mm0 \n\t" "movq (%%eax, %1), %%mm1 \n\t" PAVGB(%%mm1, %%mm0) "movq %%mm0, (%%eax) \n\t" "movq (%0, %1, 4), %%mm0 \n\t" PAVGB(%%mm0, %%mm1) "movq %%mm1, (%%eax, %1, 2) \n\t" "movq (%%ecx, %1), %%mm1 \n\t" PAVGB(%%mm1, %%mm0) "movq %%mm0, (%%ecx) \n\t" "movq (%0, %1, 8), %%mm0 \n\t" PAVGB(%%mm0, %%mm1) "movq %%mm1, (%%ecx, %1, 2) \n\t" : : "r" (src), "r" (stride) : "%eax", "%ecx" );#else int a, b, x; src+= 4*stride; for(x=0; x<2; x++){ a= *(uint32_t*)&src[stride*0]; b= *(uint32_t*)&src[stride*2]; *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); a= *(uint32_t*)&src[stride*4]; *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); b= *(uint32_t*)&src[stride*6]; *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); a= *(uint32_t*)&src[stride*8]; *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); src += 4; }#endif}/** * Deinterlaces the given block by cubic interpolating every second line. * will be called for every 8x8 block and can read & write from line 4-15 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too * lines 4-12 will be read into the deblocking filter and should be deinterlaced * this filter will read lines 3-15 and write 7-13 */static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride){#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= stride*3; asm volatile( "leal (%0, %1), %%eax \n\t" "leal (%%eax, %1, 4), %%edx \n\t" "leal (%%edx, %1, 4), %%ecx \n\t" "addl %1, %%ecx \n\t" "pxor %%mm7, %%mm7 \n\t"// 0 1 2 3 4 5 6 7 8 9 10// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx#define DEINT_CUBIC(a,b,c,d,e)\ "movq " #a ", %%mm0 \n\t"\ "movq " #b ", %%mm1 \n\t"\ "movq " #d ", %%mm2 \n\t"\ "movq " #e ", %%mm3 \n\t"\ PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ "movq %%mm0, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpckhbw %%mm7, %%mm2 \n\t"\ "movq %%mm1, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ "packuswb %%mm3, %%mm1 \n\t"\ "movq %%mm1, " #c " \n\t"DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) : : "r" (src), "r" (stride) : "%eax", "%edx", "ecx" );#else int x; src+= stride*3; for(x=0; x<8; x++) { src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); src++; }#endif}/** * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. * will be called for every 8x8 block and can read & write from line 4-15 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too * lines 4-12 will be read into the deblocking filter and should be deinterlaced * this filter will read lines 4-13 and write 5-11 */static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp){#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= stride*4; asm volatile( "leal (%0, %1), %%eax \n\t" "leal (%%eax, %1, 4), %%edx \n\t" "pxor %%mm7, %%mm7 \n\t" "movq (%2), %%mm0 \n\t"// 0 1 2 3 4 5 6 7 8 9 10// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx#define DEINT_FF(a,b,c,d)\ "movq " #a ", %%mm1 \n\t"\ "movq " #b ", %%mm2 \n\t"\ "movq " #c ", %%mm3 \n\t"\ "movq " #d ", %%mm4 \n\t"\ PAVGB(%%mm3, %%mm1) \ PAVGB(%%mm4, %%mm0) \ "movq %%mm0, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm0 \n\t"\ "punpckhbw %%mm7, %%mm3 \n\t"\ "movq %%mm1, %%mm4 \n\t"\ "punpcklbw %%mm7, %%mm1 \n\t"\ "punpckhbw %%mm7, %%mm4 \n\t"\ "psllw $2, %%mm1 \n\t"\ "psllw $2, %%mm4 \n\t"\ "psubw %%mm0, %%mm1 \n\t"\ "psubw %%mm3, %%mm4 \n\t"\ "movq %%mm2, %%mm5 \n\t"\ "movq %%mm2, %%mm0 \n\t"\ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpckhbw %%mm7, %%mm5 \n\t"\ "paddw %%mm2, %%mm1 \n\t"\ "paddw %%mm5, %%mm4 \n\t"\ "psraw $2, %%mm1 \n\t"\ "psraw $2, %%mm4 \n\t"\ "packuswb %%mm4, %%mm1 \n\t"\ "movq %%mm1, " #b " \n\t"\DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2))DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) )DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2))DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) "movq %%mm0, (%2) \n\t" : : "r" (src), "r" (stride), "r"(tmp) : "%eax", "%edx" );#else int x; src+= stride*4; for(x=0; x<8; x++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -