timgfilterpostproc.cpp

来自「从FFMPEG转换而来的H264解码程序,VC下编译..」· C++ 代码 · 共 1,961 行 · 第 1/5 页

CPP
1,961
字号
       csimd::pmulhw(mm1, mm0);
       csimd::movq(mm7, (out + offset + 0 * 8));
       csimd::pmulhw(mm4, mm6);
       csimd::movq(mm5, (out + offset + 6 * 8));
       csimd::movq(mm3, mm7);
       csimd::movq((fdct_tg_all_16 + 8), mm5);
       csimd::psubsw(mm2, mm7);
       csimd::paddsw(mm2, mm3);
       csimd::pmulhw(mm7, mm5);
       csimd::paddsw(mm3, mm0);
       csimd::paddsw(mm4, mm6);
       csimd::pmulhw((fdct_tg_all_16 + 0), mm3);
       csimd::por(fdct_one_corr, mm0);
       csimd::paddsw(mm7, mm5);
       csimd::psubsw(mm6, mm7);
       csimd::movq(mm0, (out + offset + 1 * 8));
       csimd::paddsw(mm4, mm5);
       csimd::movq(mm7, (out + offset + 3 * 8));
       csimd::psubsw(mm1, mm3);
       csimd::movq(mm5, (out + offset + 5 * 8));
       csimd::movq(mm3, (out + offset + 7 * 8));
    }
  };
/*
 struct TrowMMX2
  {
   static __forceinline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table)
    {
       __m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
       mm5=_mm_shuffle_pi16(*(__m64*)(in+4),0x1b);// pshufw((in + 4), mm5, 0x1B);
       csimd::movq((in + 0), mm0);
       csimd::movq(mm0, mm1);
       csimd::paddsw(mm5, mm0);
       csimd::psubsw(mm5, mm1);
       csimd::movq(mm0, mm2);
       csimd::punpckldq(mm1, mm0);
       csimd::punpckhdq(mm1, mm2);
       csimd::movq((table + 0), mm1);
       csimd::movq((table + 4), mm3);
       csimd::movq((table + 8), mm4);
       csimd::movq((table + 12), mm5);
       csimd::movq((table + 16), mm6);
       csimd::movq((table + 20), mm7);
       csimd::pmaddwd(mm0, mm1);
       csimd::pmaddwd(mm2, mm3);
       csimd::pmaddwd(mm0, mm4);
       csimd::pmaddwd(mm2, mm5);
       csimd::pmaddwd(mm0, mm6);
       csimd::pmaddwd(mm2, mm7);
       csimd::pmaddwd((table + 24), mm0);
       csimd::pmaddwd((table + 28), mm2);
       csimd::paddd(mm1, mm3);
       csimd::paddd(mm4, mm5);
       csimd::paddd(mm6, mm7);
       csimd::paddd(mm0, mm2);
       csimd::movq(fdct_r_row, mm0);
       csimd::paddd(mm0, mm3);
       csimd::paddd(mm0, mm5);
       csimd::paddd(mm0, mm7);
       csimd::paddd(mm0, mm2);
       csimd::psrad(SHIFT_FRW_ROW, mm3);
       csimd::psrad(SHIFT_FRW_ROW, mm5);
       csimd::psrad(SHIFT_FRW_ROW, mm7);
       csimd::psrad(SHIFT_FRW_ROW, mm2);
       csimd::packssdw(mm5, mm3);
       csimd::packssdw(mm2, mm7);
       csimd::movq(mm3, (out + 0));
       csimd::movq(mm7, (out + 4));
    }
  };
*/
 // Row stage of the forward DCT (judging by the name fdct_row), MMX flavour.
 // Reads eight 16-bit inputs from `in`, 32 16-bit coefficients from `table`
 // (table+0 .. table+28, four words per load) and writes eight 16-bit results
 // to `out`.
 // NOTE(review): the csimd:: wrappers are used here with (source, destination)
 // operand order - the const `in`/`table` pointers always appear first and the
 // not-yet-initialised mm registers second. The step comments below rely on
 // that reading; confirm against the csimd definition.
 struct TrowMMX
  {
   static __forceinline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table)
    {
       __m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
       // Build the upper half of the row in reversed word order (in[7],in[6],in[5],in[4])
       // in mm1; the commented-out MMX2 variant does the same with one pshufw 0x1B.
       csimd::movd((in + 6), mm1);
       csimd::punpcklwd((in + 4), mm1);
       csimd::movq(mm1, mm2);
       csimd::psrlq(0x20, mm1);
       // mm0 = in[0..3]
       csimd::movq((in + 0), mm0);
       csimd::punpcklwd(mm2, mm1);
       // mm0 = lower half + reversed upper half, mm5 = lower half - reversed upper half
       // (both with signed saturation).
       csimd::movq(mm0, mm5);
       csimd::paddsw(mm1, mm0);
       csimd::psubsw(mm1, mm5);
       // Regroup the sums and differences as dword pairs for the multiply-adds below.
       csimd::movq(mm0, mm2);
       csimd::punpckldq(mm5, mm0);
       csimd::punpckhdq(mm5, mm2);
       // Load the first six coefficient groups; the last two (table+24, table+28)
       // are used directly as memory operands further down.
       csimd::movq((table + 0), mm1);
       csimd::movq((table + 4), mm3);
       csimd::movq((table + 8), mm4);
       csimd::movq((table + 12), mm5);
       csimd::movq((table + 16), mm6);
       csimd::movq((table + 20), mm7);
       // Multiply-accumulate the regrouped inputs with every coefficient group.
       csimd::pmaddwd(mm0, mm1);
       csimd::pmaddwd(mm2, mm3);
       csimd::pmaddwd(mm0, mm4);
       csimd::pmaddwd(mm2, mm5);
       csimd::pmaddwd(mm0, mm6);
       csimd::pmaddwd(mm2, mm7);
       csimd::pmaddwd((table + 24), mm0);
       csimd::pmaddwd((table + 28), mm2);
       // Combine the partial products pairwise into four 2x32-bit accumulators
       // (mm3, mm5, mm7, mm2).
       csimd::paddd(mm1, mm3);
       csimd::paddd(mm4, mm5);
       csimd::paddd(mm6, mm7);
       csimd::paddd(mm0, mm2);
       // Add the constant fdct_r_row to every accumulator, then shift right
       // arithmetically by SHIFT_FRW_ROW.
       csimd::movq(fdct_r_row, mm0);
       csimd::paddd(mm0, mm3);
       csimd::paddd(mm0, mm5);
       csimd::paddd(mm0, mm7);
       csimd::paddd(mm0, mm2);
       csimd::psrad(SHIFT_FRW_ROW, mm3);
       csimd::psrad(SHIFT_FRW_ROW, mm5);
       csimd::psrad(SHIFT_FRW_ROW, mm7);
       csimd::psrad(SHIFT_FRW_ROW, mm2);
       // Pack the 32-bit results to 16 bits with signed saturation and store
       // the eight outputs.
       csimd::packssdw(mm5, mm3);
       csimd::packssdw(mm2, mm7);
       csimd::movq(mm3, (out + 0));
       csimd::movq(mm7, (out + 4));
   }
  };

    __align8(int64_t,align_tmp[16]);
    int16_t * const block_tmp= (int16_t*)align_tmp;
    int16_t *block1, *out;
    const int16_t *table;
    int i;

    block1 = block_tmp;
    Tcol::fdct_col(block, block1, 0);
    Tcol::fdct_col(block, block1, 4);

    block1 = block_tmp;
    table = tab_frw_01234567;
    out = block;
    for(i=8;i>0;i--) {
        TrowMMX::fdct_row(block1, out, table);
        block1 += 8;
        table += 32;
        out += 8;
    }
}

// Creates the filter with no work buffers allocated: `temp` and `src` start
// out as NULL and `old_sppMode` as -1. The two arguments are forwarded to the
// TimgFilterPostprocBase constructor together with `false`.
TimgFilterPostprocSpp::TimgFilterPostprocSpp(IffdshowBase *Ideci,Tfilters *Iparent)
 :TimgFilterPostprocBase(Ideci,Iparent,false),
  temp(NULL),
  src(NULL),
  old_sppMode(-1)
{
}
// Releases the two aligned work buffers (if allocated) and resets both
// pointers to NULL, so calling it again is harmless.
void TimgFilterPostprocSpp::done(void)
{
 if (temp)
  {
   aligned_free(temp);
  }
 temp=NULL;

 if (src)
  {
   aligned_free(src);
  }
 src=NULL;
}
void TimgFilterPostprocSpp::onSizeChange(void)
{
 done();
}

// 8x8 table of dither offsets (values 0..63) for the MMX-sized store path.
// store_slice() and store_slice_c() pick row `y` of this table for output
// row `y` and add the entries to the samples before the final right shift.
template<> __align16(const uint8_t,TimgFilterPostprocSpp::TstoreSlice<Tmmx>::dither[8][8])=
{
 {  0,  48,  12,  60,   3,  51,  15,  63, },
 { 32,  16,  44,  28,  35,  19,  47,  31, },
 {  8,  56,   4,  52,  11,  59,   7,  55, },
 { 40,  24,  36,  20,  43,  27,  39,  23, },
 {  2,  50,  14,  62,   1,  49,  13,  61, },
 { 34,  18,  46,  30,  33,  17,  45,  29, },
 { 10,  58,   6,  54,   9,  57,   5,  53, },
 { 42,  26,  38,  22,  41,  25,  37,  21, },
};
#ifdef __SSE2__
// Dither table for the SSE2 store path: each 16-byte row is the matching
// 8-byte row of the Tmmx table repeated twice, so that one 16-byte load
// covers sixteen output pixels.
template<> __align16(const uint8_t,TimgFilterPostprocSpp::TstoreSlice<Tsse2>::dither[8][16])=
{
 {  0,  48,  12,  60,   3,  51,  15,  63,   0,  48,  12,  60,   3,  51,  15,  63 },
 { 32,  16,  44,  28,  35,  19,  47,  31,  32,  16,  44,  28,  35,  19,  47,  31 },
 {  8,  56,   4,  52,  11,  59,   7,  55,   8,  56,   4,  52,  11,  59,   7,  55 },
 { 40,  24,  36,  20,  43,  27,  39,  23,  40,  24,  36,  20,  43,  27,  39,  23 },
 {  2,  50,  14,  62,   1,  49,  13,  61,   2,  50,  14,  62,   1,  49,  13,  61 },
 { 34,  18,  46,  30,  33,  17,  45,  29,  34,  18,  46,  30,  33,  17,  45,  29 },
 { 10,  58,   6,  54,   9,  57,   5,  53,  10,  58,   6,  54,   9,  57,   5,  53 },
 { 42,  26,  38,  22,  41,  25,  37,  21,  42,  26,  38,  22,  41,  25,  37,  21 },
};
#endif

// Plain C++ version of the slice store: walks the slice row by row and, in
// steps of eight columns, hands each of the eight positions of a group to
// store() together with the dither row for that output row.
// Note that a whole group of eight is always processed, even when `width`
// is not a multiple of eight.
// NOTE(review): the dither table has eight rows, so `height` is presumably
// never larger than 8 - confirm against the callers.
void TimgFilterPostprocSpp::store_slice_c(uint8_t *dst, const int16_t *src, int dst_stride, int src_stride, unsigned int width, unsigned int height, int log2_scale)
{
 for (unsigned int row=0;row<height;row++)
  {
   const uint8_t *ditherRow=TstoreSlice<Tmmx>::dither[row];
   for (unsigned int col=0;col<width;col+=8)
    {
     // positions 0..7 inside the current group of eight pixels
     for (int pos=0;pos<8;pos++)
      store(pos,src,col,row,dst_stride,src_stride,log2_scale,ditherRow,dst);
    }
  }
}
// SIMD slice store: converts `height` rows of 16-bit samples at `src0` into
// 8-bit pixels at `dst0`, adding the per-row dither offsets on the way.
//   dst_stride - distance between output rows, in bytes (dst is a byte pointer)
//   src_stride - distance between input rows, in int16_t elements
//   width      - pixels per row; the part that does not fill a whole SIMD
//                register is delegated to store_slice_c()
//   log2_scale - the dither offsets are shifted right by this amount and the
//                sums by (6 - log2_scale)
// The SIMD wrappers are used here with (destination, source) operand order.
// NOTE(review): dither[] has eight rows in the specializations visible in this
// file, so `height` is presumably at most 8 - confirm against the callers.
template<class _mm> void TimgFilterPostprocSpp::TstoreSlice<_mm>::store_slice(uint8_t *dst0, const int16_t *src0, int dst_stride, int src_stride, unsigned int width, unsigned int height, int log2_scale)
{
 const int16_t *src=src0;unsigned char *dst=dst0;
 // When the instruction set wants alignment (_mm::align) and any pointer or
 // stride has one of its low four bits set, use the Tmmx implementation instead.
 if (_mm::align && (intptr_t(src)&15 || intptr_t(dst)&15 || src_stride&15 || dst_stride&15))
  {
   TstoreSlice<Tmmx>::store_slice(dst,src,dst_stride,src_stride,width,height,log2_scale);
   return;
  }
 // Width rounded down to a multiple of _mm::size (assumes _mm::size is a power of two).
 unsigned int mmxw=width&~(_mm::size-1);
 for(unsigned int y=0; y<height; y++)
  {
   typename _mm::__m mm3,mm4,mm0,mm1;
   // Expand the dither row for this line from bytes to words: low half of the
   // register into mm3, high half into mm4, then pre-shift by log2_scale.
   movq (mm3,dither[y]);
   movq (mm4,dither[y]);
   pxor (mm0, mm0);
   punpcklbw (mm3,mm0);
   punpckhbw (mm4,mm0);
   psraw (mm3,log2_scale);
   psraw (mm4,log2_scale);
   uint8_t *dst1=dst;
   const uint8_t *src1=(const uint8_t*)src;
   // Stop as soon as fewer than _mm::size output bytes remain in the row.
   const uint8_t *dst1end=dst+width-_mm::size+1;
   // Each iteration consumes 2*_mm::size input bytes (16-bit samples) and
   // produces _mm::size output bytes.
   for (;dst1<dst1end;src1+=_mm::size*2,dst1+=_mm::size)
    {
     movq (mm0,src1);
     movq (mm1,src1+_mm::size);
     paddw (mm0,mm3);
     paddw (mm1,mm4);
     psraw (mm0,6-log2_scale);
     psraw (mm1,6-log2_scale);
     // Narrow both halves to bytes with unsigned saturation and store them.
     packuswb(mm0,mm1);
     movq (dst1,mm0);
    }
   src+=src_stride;
   dst+=dst_stride;
  }
 // Remaining columns on the right that did not fill a whole register.
 if(width != mmxw)
  store_slice_c(dst0 + mmxw, src0 + mmxw, dst_stride, src_stride, width - mmxw, height,log2_scale);
}
// Soft-threshold kernel for four groups of four 16-bit coefficients.
//   src0..src3 - addresses of the four 8-byte input groups
//   dst0..dst3 - addresses of the four 8-byte output groups
//   mm4        - value subtracted from each magnitude (softthresh_mmx passes the
//                threshold replicated into every 16-bit lane)
//   mm5        - value added before the final >>3 (softthresh_mmx passes 4 in every lane)
// The SIMD wrappers are used here with (destination, source) operand order.
__forceinline void TimgFilterPostprocSpp::requant_core_soft(unsigned char *dst0,unsigned char *dst1,unsigned char *dst2,unsigned char *dst3,const unsigned char *src0,const unsigned char *src1,const unsigned char *src2,const unsigned char *src3,const __m64 &mm4,const __m64 &mm5)
{
 __m64 mm0,mm1,mm6,mm7,mm2,mm3;
 movq  (mm0,src0);
 movq  (mm1,src1);
 // mm6/mm7 = per-lane sign masks (all ones where the coefficient is negative).
 pxor    (mm6,mm6 );
 pxor    (mm7,mm7 );
 pcmpgtw (mm6,mm0 );
 pcmpgtw (mm7,mm1 );
 // XOR with the sign mask (one's complement of negative lanes), subtract mm4
 // with unsigned saturation so small values clamp to zero, then XOR again to
 // restore the sign.
 pxor    (mm0,mm6 );
 pxor    (mm1,mm7 );
 psubusw (mm0,mm4 );
 psubusw (mm1,mm4 );
 pxor    (mm0,mm6 );
 pxor    (mm1,mm7 );
 // Same sequence for the third and fourth group.
 movq    (mm2,src2);
 movq    (mm3,src3);
 pxor    (mm6,mm6);
 pxor    (mm7,mm7);
 pcmpgtw (mm6,mm2);
 pcmpgtw (mm7,mm3);
 pxor    (mm2,mm6);
 pxor    (mm3,mm7);
 psubusw (mm2,mm4);
 psubusw (mm3,mm4);
 pxor    (mm2,mm6);
 pxor    (mm3,mm7);

 // Add mm5 with signed saturation and shift right arithmetically by 3.
 paddsw (mm0,mm5 );
 paddsw (mm1,mm5 );
 paddsw (mm2,mm5 );
 paddsw (mm3,mm5 );
 psraw  (mm0, 3  );
 psraw  (mm1, 3  );
 psraw  (mm2, 3  );
 psraw  (mm3, 3  );

 // Interleave the four result registers word-wise; the statement order matters
 // because registers are reused as scratch space.
 movq      (mm7,mm0);
 punpcklwd (mm0,mm2);
 punpckhwd (mm7,mm2);
 movq      (mm2,mm1);
 punpcklwd (mm1,mm3);
 punpckhwd (mm2,mm3);
 movq      (mm3,mm0);
 punpcklwd (mm0,mm1);
 punpckhwd (mm3,mm7);
 punpcklwd (mm7,mm2);
 punpckhwd (mm1,mm2);

 // Write the four shuffled groups.
 movq (dst0,mm0);
 movq (dst1,mm7);
 movq (dst2,mm3);
 movq (dst3,mm1);
}
// Soft-threshold requantisation of one block of 64 coefficients (MMX path).
//   dst0 - receives the 64 processed coefficients
//   src0 - the 64 input coefficients
//   qp   - quantiser; the threshold is qp*16-1 while `bias` stays 0
// The block is processed as four calls of requant_core_soft(), each handling
// four 8-byte groups; all pointer offsets below are in bytes. Element 0 is
// overwritten at the end with (src0[0]+4)>>3, i.e. it bypasses the threshold.
void TimgFilterPostprocSpp::softthresh_mmx(DCTELEM dst0[64], const DCTELEM src0[64], int qp)
{
 int bias=0; //FIXME
 unsigned int threshold1=qp*((1<<4)-bias)-1;

 // Replicate the threshold into all four 16-bit lanes (two packs with
 // signed saturation).
 __m64 thresholdVec;
 movd(thresholdVec,threshold1);
 packssdw(thresholdVec,thresholdVec);
 packssdw(thresholdVec,thresholdVec);

 // Same for the constant 4 that is added before the >>3 inside the kernel.
 __m64 roundVec;
 movd(roundVec,4);
 packssdw(roundVec,roundVec);
 packssdw(roundVec,roundVec);

 const unsigned char *in=reinterpret_cast<const unsigned char*>(src0);
 unsigned char *outp=reinterpret_cast<unsigned char*>(dst0);

 requant_core_soft(outp   ,outp+  8,outp+ 16,outp+ 24, in   ,in+ 8,in+ 64,in+ 72, thresholdVec,roundVec);
 requant_core_soft(outp+32,outp+ 40,outp+ 48,outp+ 56, in+16,in+24,in+ 48,in+ 56, thresholdVec,roundVec);
 requant_core_soft(outp+64,outp+ 72,outp+ 80,outp+ 88, in+32,in+40,in+ 96,in+104, thresholdVec,roundVec);
 requant_core_soft(outp+96,outp+104,outp+112,outp+120, in+80,in+88,in+112,in+120, thresholdVec,roundVec);

 // Must stay after the kernel calls: they also write the first output group.
 dst0[0]=DCTELEM((src0[0]+4)>>3);
}
// Hard-threshold kernel for four groups of four 16-bit coefficients.
//   src0..src3 - addresses of the four 8-byte input groups
//   dst0..dst3 - addresses of the four 8-byte output groups
//   mm4, mm5, mm6 - per-lane constants; hardthresh_mmx passes threshold1+1,
//                   threshold1+5 and threshold1-4 respectively
// The SIMD wrappers are used here with (destination, source) operand order.
// NOTE(review): the subtract / saturating-add / add / saturating-subtract chain
// presumably zeroes coefficients whose magnitude does not exceed the threshold
// and leaves the others offset by +4 for the final >>3 - confirm before relying on it.
__forceinline void TimgFilterPostprocSpp::requant_core_hard(unsigned char *dst0,unsigned char *dst1,unsigned char *dst2,unsigned char *dst3,const unsigned char *src0,const unsigned char *src1,const unsigned char *src2,const unsigned char *src3,const __m64 &mm4,const __m64 &mm5,const __m64 &mm6)
{
 __m64 mm0,mm1,mm2,mm3;
 movq( mm0,src0);
 movq( mm1,src1);
 movq( mm2,src2);
 movq( mm3,src3);

 // Wrapping subtract of mm4 ...
 psubw( mm0, mm4);
 psubw( mm1, mm4);
 psubw( mm2, mm4);
 psubw( mm3, mm4);

 // ... followed by an add of mm5 with unsigned saturation.
 paddusw( mm0,mm5);
 paddusw( mm1,mm5);
 paddusw( mm2,mm5);
 paddusw( mm3,mm5);

 // Wrapping add of mm6 ...
 paddw( mm0,mm6);
 paddw( mm1,mm6);
 paddw( mm2,mm6);
 paddw( mm3,mm6);

 // ... followed by a subtract of mm6 with unsigned saturation.
 psubusw( mm0,mm6);
 psubusw( mm1,mm6);
 psubusw( mm2,mm6);
 psubusw( mm3,mm6);

 // Arithmetic shift right by 3.
 psraw( mm0,3);
 psraw( mm1,3);
 psraw( mm2,3);
 psraw( mm3,3);
 // Interleave the four result registers word-wise; the statement order matters
 // because registers are reused as scratch space.
 __m64 mm7;
 movq      (mm7,mm0);
 punpcklwd (mm0,mm2);
 punpckhwd (mm7,mm2);
 movq      (mm2,mm1);
 punpcklwd (mm1,mm3);
 punpckhwd (mm2,mm3);
 movq      (mm3,mm0);
 punpcklwd (mm0,mm1);
 punpckhwd (mm3,mm7);
 punpcklwd (mm7,mm2);
 punpckhwd (mm1,mm2);

 // Write the four shuffled groups.
 movq (dst0,mm0);
 movq (dst1,mm7);
 movq (dst2,mm3);
 movq (dst3,mm1);
}

// Hard-threshold requantisation of one block of 64 coefficients (MMX path).
//   dst0 - receives the 64 processed coefficients
//   src0 - the 64 input coefficients
//   qp   - quantiser; the base threshold is qp*16-1 while `bias` stays 0
// Three constants derived from the threshold (+1, +5, -4) are replicated into
// every 16-bit lane and handed to requant_core_hard(), which is called four
// times with four 8-byte groups each; all pointer offsets below are in bytes.
// Element 0 is overwritten at the end with (src0[0]+4)>>3.
void TimgFilterPostprocSpp::hardthresh_mmx(DCTELEM dst0[64], const DCTELEM src0[64], int qp)
{
 int bias=0; //FIXME
 unsigned int threshold1=qp*((1<<4)-bias)-1;

 // Constant used by the kernel's wrapping subtract.
 __m64 subVec;
 movd(subVec,threshold1+1);
 packssdw(subVec,subVec);
 packssdw(subVec,subVec);

 // Constant used by the kernel's unsigned-saturating add.
 __m64 addSatVec;
 movd(addSatVec,threshold1+5);
 packssdw(addSatVec,addSatVec);
 packssdw(addSatVec,addSatVec);

 // Constant the kernel first adds and then subtracts with unsigned saturation.
 __m64 clipVec;
 movd(clipVec,threshold1-4);
 packssdw(clipVec,clipVec);
 packssdw(clipVec,clipVec);

 const unsigned char *in=reinterpret_cast<const unsigned char*>(src0);
 unsigned char *outp=reinterpret_cast<unsigned char*>(dst0);

 requant_core_hard(outp   ,outp+  8,outp+ 16,outp+ 24, in   ,in+ 8,in+ 64,in+ 72, subVec,addSatVec,clipVec);
 requant_core_hard(outp+32,outp+ 40,outp+ 48,outp+ 56, in+16,in+24,in+ 48,in+ 56, subVec,addSatVec,clipVec);
 requant_core_hard(outp+64,outp+ 72,outp+ 80,outp+ 88, in+32,in+40,in+ 96,in+104, subVec,addSatVec,clipVec);
 requant_core_hard(outp+96,outp+104,outp+112,outp+120, in+80,in+88,in+112,in+120, subVec,addSatVec,clipVec);

 // Must stay after the kernel calls: they also write the first output group.
 dst0[0]=DCTELEM((src0[0]+4)>>3);
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?