timgfilterpostproc.cpp
来自「从FFMPEG转换而来的H264解码程序,VC下编译..」· C++ 代码 · 共 1,961 行 · 第 1/5 页
CPP
1,961 行
csimd::pmulhw(mm1, mm0);
csimd::movq(mm7, (out + offset + 0 * 8));
csimd::pmulhw(mm4, mm6);
csimd::movq(mm5, (out + offset + 6 * 8));
csimd::movq(mm3, mm7);
csimd::movq((fdct_tg_all_16 + 8), mm5);
csimd::psubsw(mm2, mm7);
csimd::paddsw(mm2, mm3);
csimd::pmulhw(mm7, mm5);
csimd::paddsw(mm3, mm0);
csimd::paddsw(mm4, mm6);
csimd::pmulhw((fdct_tg_all_16 + 0), mm3);
csimd::por(fdct_one_corr, mm0);
csimd::paddsw(mm7, mm5);
csimd::psubsw(mm6, mm7);
csimd::movq(mm0, (out + offset + 1 * 8));
csimd::paddsw(mm4, mm5);
csimd::movq(mm7, (out + offset + 3 * 8));
csimd::psubsw(mm1, mm3);
csimd::movq(mm5, (out + offset + 5 * 8));
csimd::movq(mm3, (out + offset + 7 * 8));
}
};
/*
struct TrowMMX2
{
static __forceinline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table)
{
__m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
mm5=_mm_shuffle_pi16(*(__m64*)(in+4),0x1b);// pshufw((in + 4), mm5, 0x1B);
csimd::movq((in + 0), mm0);
csimd::movq(mm0, mm1);
csimd::paddsw(mm5, mm0);
csimd::psubsw(mm5, mm1);
csimd::movq(mm0, mm2);
csimd::punpckldq(mm1, mm0);
csimd::punpckhdq(mm1, mm2);
csimd::movq((table + 0), mm1);
csimd::movq((table + 4), mm3);
csimd::movq((table + 8), mm4);
csimd::movq((table + 12), mm5);
csimd::movq((table + 16), mm6);
csimd::movq((table + 20), mm7);
csimd::pmaddwd(mm0, mm1);
csimd::pmaddwd(mm2, mm3);
csimd::pmaddwd(mm0, mm4);
csimd::pmaddwd(mm2, mm5);
csimd::pmaddwd(mm0, mm6);
csimd::pmaddwd(mm2, mm7);
csimd::pmaddwd((table + 24), mm0);
csimd::pmaddwd((table + 28), mm2);
csimd::paddd(mm1, mm3);
csimd::paddd(mm4, mm5);
csimd::paddd(mm6, mm7);
csimd::paddd(mm0, mm2);
csimd::movq(fdct_r_row, mm0);
csimd::paddd(mm0, mm3);
csimd::paddd(mm0, mm5);
csimd::paddd(mm0, mm7);
csimd::paddd(mm0, mm2);
csimd::psrad(SHIFT_FRW_ROW, mm3);
csimd::psrad(SHIFT_FRW_ROW, mm5);
csimd::psrad(SHIFT_FRW_ROW, mm7);
csimd::psrad(SHIFT_FRW_ROW, mm2);
csimd::packssdw(mm5, mm3);
csimd::packssdw(mm2, mm7);
csimd::movq(mm3, (out + 0));
csimd::movq(mm7, (out + 4));
}
};
*/
// Row pass of the forward DCT expressed through the csimd:: instruction wrappers.
// NOTE(review): the csimd:: calls in this struct appear to take their operands as
// (source, destination): the two final movq calls have `out` as the second
// argument, and mm0..mm7 are declared uninitialized and first show up in the
// second position. Confirm against the csimd definition.
struct TrowMMX
{
// Processes one row: reads in[0..7], reads `table` at offsets 0,4,...,28 and
// writes eight int16 results to out[0..7]. fdct_r_row and SHIFT_FRW_ROW are
// defined outside this block.
static __forceinline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table)
{
__m64 mm0,mm1,mm2,mm3,mm4,mm5,mm6,mm7;
// Build the second half of the row in mm1 using only movd/punpcklwd/psrlq.
// NOTE(review): presumably this yields in[7],in[6],in[5],in[4], i.e. the same
// reversal the commented-out TrowMMX2 variant gets from a single pshufw 0x1B
// -- verify.
csimd::movd((in + 6), mm1);
csimd::punpcklwd((in + 4), mm1);
csimd::movq(mm1, mm2);
csimd::psrlq(0x20, mm1);
csimd::movq((in + 0), mm0);
csimd::punpcklwd(mm2, mm1);
// mm0 = first half + mm1, mm5 = first half - mm1 (saturating word add/sub).
csimd::movq(mm0, mm5);
csimd::paddsw(mm1, mm0);
csimd::psubsw(mm1, mm5);
// Interleave the dwords of the sum (mm0) and difference (mm5) into mm0 / mm2.
csimd::movq(mm0, mm2);
csimd::punpckldq(mm5, mm0);
csimd::punpckhdq(mm5, mm2);
// Load six of the eight coefficient groups; the last two (table+24, table+28)
// are used directly as pmaddwd memory operands below.
csimd::movq((table + 0), mm1);
csimd::movq((table + 4), mm3);
csimd::movq((table + 8), mm4);
csimd::movq((table + 12), mm5);
csimd::movq((table + 16), mm6);
csimd::movq((table + 20), mm7);
// Multiply-accumulate the two interleaved inputs against the coefficients.
csimd::pmaddwd(mm0, mm1);
csimd::pmaddwd(mm2, mm3);
csimd::pmaddwd(mm0, mm4);
csimd::pmaddwd(mm2, mm5);
csimd::pmaddwd(mm0, mm6);
csimd::pmaddwd(mm2, mm7);
csimd::pmaddwd((table + 24), mm0);
csimd::pmaddwd((table + 28), mm2);
// Pairwise sums of the partial products; results end up in mm3, mm5, mm7, mm2.
csimd::paddd(mm1, mm3);
csimd::paddd(mm4, mm5);
csimd::paddd(mm6, mm7);
csimd::paddd(mm0, mm2);
// Add the fdct_r_row constant to every sum, then shift right by SHIFT_FRW_ROW.
csimd::movq(fdct_r_row, mm0);
csimd::paddd(mm0, mm3);
csimd::paddd(mm0, mm5);
csimd::paddd(mm0, mm7);
csimd::paddd(mm0, mm2);
csimd::psrad(SHIFT_FRW_ROW, mm3);
csimd::psrad(SHIFT_FRW_ROW, mm5);
csimd::psrad(SHIFT_FRW_ROW, mm7);
csimd::psrad(SHIFT_FRW_ROW, mm2);
// Pack the four dword pairs down to eight words and store them to out[0..7].
csimd::packssdw(mm5, mm3);
csimd::packssdw(mm2, mm7);
csimd::movq(mm3, (out + 0));
csimd::movq(mm7, (out + 4));
}
};
__align8(int64_t,align_tmp[16]);
int16_t * const block_tmp= (int16_t*)align_tmp;
int16_t *block1, *out;
const int16_t *table;
int i;
block1 = block_tmp;
Tcol::fdct_col(block, block1, 0);
Tcol::fdct_col(block, block1, 4);
block1 = block_tmp;
table = tab_frw_01234567;
out = block;
for(i=8;i>0;i--) {
TrowMMX::fdct_row(block1, out, table);
block1 += 8;
table += 32;
out += 8;
}
}
// Constructs the filter: forwards Ideci/Iparent (plus `false`) to the
// TimgFilterPostprocBase constructor, starts with no work buffers allocated
// (temp and src are NULL) and sets old_sppMode to -1.
TimgFilterPostprocSpp::TimgFilterPostprocSpp(IffdshowBase *Ideci, Tfilters *Iparent)
    : TimgFilterPostprocBase(Ideci, Iparent, false)
    , temp(NULL)
    , src(NULL)
    , old_sppMode(-1)
{
}
// Releases the two work buffers (temp, then src) with aligned_free and resets
// both pointers to NULL, so a repeated call finds nothing left to free.
void TimgFilterPostprocSpp::done(void)
{
    if (temp)
    {
        aligned_free(temp);
    }
    temp = NULL;

    if (src)
    {
        aligned_free(src);
    }
    src = NULL;
}
void TimgFilterPostprocSpp::onSizeChange(void)
{
done();
}
// 8x8 dither table for the Tmmx specialization. Every value 0..63 appears
// exactly once. A row is selected by the line index y, both in store_slice_c
// and in the SIMD store_slice of this specialization.
// NOTE(review): __align16 is a project macro; presumably it requests 16-byte
// alignment for the array -- confirm.
template<> __align16(const uint8_t,TimgFilterPostprocSpp::TstoreSlice<Tmmx>::dither[8][8])=
{
{ 0, 48, 12, 60, 3, 51, 15, 63, },
{ 32, 16, 44, 28, 35, 19, 47, 31, },
{ 8, 56, 4, 52, 11, 59, 7, 55, },
{ 40, 24, 36, 20, 43, 27, 39, 23, },
{ 2, 50, 14, 62, 1, 49, 13, 61, },
{ 34, 18, 46, 30, 33, 17, 45, 29, },
{ 10, 58, 6, 54, 9, 57, 5, 53, },
{ 42, 26, 38, 22, 41, 25, 37, 21, },
};
// Dither table for the Tsse2 specialization, compiled only when __SSE2__ is
// defined. Each 16-entry row is the corresponding 8-entry row of the Tmmx
// table written twice.
#ifdef __SSE2__
template<> __align16(const uint8_t,TimgFilterPostprocSpp::TstoreSlice<Tsse2>::dither[8][16])=
{
{ 0, 48, 12, 60, 3, 51, 15, 63, 0, 48, 12, 60, 3, 51, 15, 63 },
{ 32, 16, 44, 28, 35, 19, 47, 31, 32, 16, 44, 28, 35, 19, 47, 31 },
{ 8, 56, 4, 52, 11, 59, 7, 55, 8, 56, 4, 52, 11, 59, 7, 55 },
{ 40, 24, 36, 20, 43, 27, 39, 23, 40, 24, 36, 20, 43, 27, 39, 23 },
{ 2, 50, 14, 62, 1, 49, 13, 61, 2, 50, 14, 62, 1, 49, 13, 61 },
{ 34, 18, 46, 30, 33, 17, 45, 29, 34, 18, 46, 30, 33, 17, 45, 29 },
{ 10, 58, 6, 54, 9, 57, 5, 53, 10, 58, 6, 54, 9, 57, 5, 53 },
{ 42, 26, 38, 22, 41, 25, 37, 21, 42, 26, 38, 22, 41, 25, 37, 21 },
};
#endif
void TimgFilterPostprocSpp::store_slice_c(uint8_t *dst, const int16_t *src, int dst_stride, int src_stride, unsigned int width, unsigned int height, int log2_scale)
{
for (unsigned int y=0;y<height;y++)
{
const uint8_t *d=TstoreSlice<Tmmx>::dither[y];
for (unsigned int x=0;x<width;x+=8)
{
store(0,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
store(1,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
store(2,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
store(3,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
store(4,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
store(5,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
store(6,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
store(7,src,x,y,dst_stride,src_stride,log2_scale,d,dst);
}
}
}
// SIMD slice store, parameterised on the register flavour _mm.
//
// - If _mm requires alignment and any of src0, dst0, src_stride or dst_stride
//   has one of its low four bits set, the whole call is forwarded to the Tmmx
//   specialization instead.
// - For every line, the dither row is split into a low and a high half
//   (unpacked against a zeroed register) and shifted right by log2_scale. The
//   line is then processed in chunks of _mm::size output bytes: two registers
//   of input are loaded, the dither halves are added, both are shifted right
//   by (6 - log2_scale), packed together and stored.
// - Columns beyond the last full chunk are handed to store_slice_c.
// NOTE(review): movq/pxor/punpck*/psraw/paddw/packuswb are project wrappers;
// the description assumes they mirror the instructions they are named after.
template<class _mm> void TimgFilterPostprocSpp::TstoreSlice<_mm>::store_slice(uint8_t *dst0, const int16_t *src0, int dst_stride, int src_stride, unsigned int width, unsigned int height, int log2_scale)
{
    if (_mm::align)
    {
        const bool misaligned = (intptr_t(src0)&15) || (intptr_t(dst0)&15) || (src_stride&15) || (dst_stride&15);
        if (misaligned)
        {
            TstoreSlice<Tmmx>::store_slice(dst0, src0, dst_stride, src_stride, width, height, log2_scale);
            return;
        }
    }

    // Number of leading columns that form complete SIMD chunks.
    const unsigned int simdWidth = width&~(_mm::size-1);

    const int16_t *srcRow = src0;
    unsigned char *dstRow = dst0;
    for (unsigned int y = 0; y < height; ++y)
    {
        typename _mm::__m ditherLo, ditherHi, pix0, pix1;

        // Prepare the two dither halves for this line.
        movq(ditherLo, dither[y]);
        movq(ditherHi, dither[y]);
        pxor(pix0, pix0);
        punpcklbw(ditherLo, pix0);
        punpckhbw(ditherHi, pix0);
        psraw(ditherLo, log2_scale);
        psraw(ditherHi, log2_scale);

        uint8_t *outPtr = dstRow;
        const uint8_t *srcBytes = (const uint8_t*)srcRow;
        const uint8_t *outEnd = dstRow+width-_mm::size+1;
        while (outPtr < outEnd)
        {
            movq(pix0, srcBytes);
            movq(pix1, srcBytes+_mm::size);
            paddw(pix0, ditherLo);
            paddw(pix1, ditherHi);
            psraw(pix0, 6-log2_scale);
            psraw(pix1, 6-log2_scale);
            packuswb(pix0, pix1);
            movq(outPtr, pix0);

            srcBytes += _mm::size*2;
            outPtr += _mm::size;
        }

        srcRow += src_stride;
        dstRow += dst_stride;
    }

    // Leftover columns go through the scalar path.
    if (width != simdWidth)
    {
        store_slice_c(dst0 + simdWidth, src0 + simdWidth, dst_stride, src_stride, width - simdWidth, height, log2_scale);
    }
}
// Soft-threshold core: processes four 64-bit groups (src0..src3) and writes four
// 64-bit groups (dst0..dst3). mm4 and mm5 are supplied by the caller
// (softthresh_mmx fills mm4 from its threshold and mm5 from the constant 4).
// The wrappers in this function take (destination, source).
// NOTE(review): the step descriptions below assume the wrappers mirror the MMX
// instructions they are named after -- confirm against their definitions.
__forceinline void TimgFilterPostprocSpp::requant_core_soft(unsigned char *dst0,unsigned char *dst1,unsigned char *dst2,unsigned char *dst3,const unsigned char *src0,const unsigned char *src1,const unsigned char *src2,const unsigned char *src3,const __m64 &mm4,const __m64 &mm5)
{
__m64 mm0,mm1,mm6,mm7,mm2,mm3;
// First pair: build a per-word sign mask (0 > x), flip negative words with it,
// subtract mm4 with unsigned saturation, then flip back with the same mask.
movq (mm0,src0);
movq (mm1,src1);
pxor (mm6,mm6 );
pxor (mm7,mm7 );
pcmpgtw (mm6,mm0 );
pcmpgtw (mm7,mm1 );
pxor (mm0,mm6 );
pxor (mm1,mm7 );
psubusw (mm0,mm4 );
psubusw (mm1,mm4 );
pxor (mm0,mm6 );
pxor (mm1,mm7 );
// Second pair: same treatment for src2 / src3, reusing mm6 / mm7 as masks.
movq (mm2,src2);
movq (mm3,src3);
pxor (mm6,mm6);
pxor (mm7,mm7);
pcmpgtw (mm6,mm2);
pcmpgtw (mm7,mm3);
pxor (mm2,mm6);
pxor (mm3,mm7);
psubusw (mm2,mm4);
psubusw (mm3,mm4);
pxor (mm2,mm6);
pxor (mm3,mm7);
// Add mm5 with signed saturation, then arithmetic shift right by 3.
paddsw (mm0,mm5 );
paddsw (mm1,mm5 );
paddsw (mm2,mm5 );
paddsw (mm3,mm5 );
psraw (mm0, 3 );
psraw (mm1, 3 );
psraw (mm2, 3 );
psraw (mm3, 3 );
// Interleave the 16-bit words of the four registers into the output order.
// NOTE(review): this is a fixed word permutation, not a plain 4x4 transpose;
// which coefficient ends up where depends on the DCT layout defined elsewhere.
movq (mm7,mm0);
punpcklwd (mm0,mm2);
punpckhwd (mm7,mm2);
movq (mm2,mm1);
punpcklwd (mm1,mm3);
punpckhwd (mm2,mm3);
movq (mm3,mm0);
punpcklwd (mm0,mm1);
punpckhwd (mm3,mm7);
punpcklwd (mm7,mm2);
punpckhwd (mm1,mm2);
// Store the four permuted groups.
movq (dst0,mm0);
movq (dst1,mm7);
movq (dst2,mm3);
movq (dst3,mm1);
}
// Soft-thresholds a 64-coefficient block from src0 into dst0.
//
// The threshold is qp*16 - 1 (bias is currently fixed at 0). It and the
// constant 4 are each loaded with movd and run through packssdw twice before
// being handed to requant_core_soft, which is called four times with fixed
// byte offsets into the source and destination blocks. Finally dst0[0] is
// overwritten with (src0[0] + 4) >> 3, so the first coefficient bypasses the
// threshold.
// NOTE(review): the movd + double packssdw sequence is presumably meant to
// replicate the value into all four 16-bit lanes -- confirm.
void TimgFilterPostprocSpp::softthresh_mmx(DCTELEM dst0[64], const DCTELEM src0[64], int qp)
{
    const int bias = 0; //FIXME
    const unsigned int threshold1 = qp*((1<<4) - bias) - 1;

    __m64 thresholdReg, roundReg;

    movd(thresholdReg, threshold1);
    packssdw(thresholdReg, thresholdReg);
    packssdw(thresholdReg, thresholdReg);

    movd(roundReg, 4);
    packssdw(roundReg, roundReg);
    packssdw(roundReg, roundReg);

    // Byte-addressed views of the two coefficient blocks.
    const unsigned char *srcBytes = reinterpret_cast<const unsigned char*>(src0);
    unsigned char *dstBytes = reinterpret_cast<unsigned char*>(dst0);

    requant_core_soft(dstBytes +  0, dstBytes +   8, dstBytes +  16, dstBytes +  24,
                      srcBytes +  0, srcBytes +   8, srcBytes +  64, srcBytes +  72,
                      thresholdReg, roundReg);
    requant_core_soft(dstBytes + 32, dstBytes +  40, dstBytes +  48, dstBytes +  56,
                      srcBytes + 16, srcBytes +  24, srcBytes +  48, srcBytes +  56,
                      thresholdReg, roundReg);
    requant_core_soft(dstBytes + 64, dstBytes +  72, dstBytes +  80, dstBytes +  88,
                      srcBytes + 32, srcBytes +  40, srcBytes +  96, srcBytes + 104,
                      thresholdReg, roundReg);
    requant_core_soft(dstBytes + 96, dstBytes + 104, dstBytes + 112, dstBytes + 120,
                      srcBytes + 80, srcBytes +  88, srcBytes + 112, srcBytes + 120,
                      thresholdReg, roundReg);

    // The first coefficient is only rounded and scaled, never thresholded.
    dst0[0] = DCTELEM((src0[0] + 4) >> 3);
}
// Hard-threshold core: processes four 64-bit groups (src0..src3) and writes four
// 64-bit groups (dst0..dst3). mm4, mm5 and mm6 are supplied by the caller
// (hardthresh_mmx fills them from threshold1+1, threshold1+5 and threshold1-4).
// The wrappers in this function take (destination, source).
// NOTE(review): assuming the wrappers mirror the MMX instructions they are named
// after, the subtract / saturating-add / add / saturating-subtract chain appears
// to zero every word whose magnitude is <= threshold1 and to leave (x + 4) for
// the others, before the final >> 3 -- verify against the wrapper definitions.
__forceinline void TimgFilterPostprocSpp::requant_core_hard(unsigned char *dst0,unsigned char *dst1,unsigned char *dst2,unsigned char *dst3,const unsigned char *src0,const unsigned char *src1,const unsigned char *src2,const unsigned char *src3,const __m64 &mm4,const __m64 &mm5,const __m64 &mm6)
{
__m64 mm0,mm1,mm2,mm3;
// Load the four input groups.
movq( mm0,src0);
movq( mm1,src1);
movq( mm2,src2);
movq( mm3,src3);
// Step 1: subtract mm4 (wrapping).
psubw( mm0, mm4);
psubw( mm1, mm4);
psubw( mm2, mm4);
psubw( mm3, mm4);
// Step 2: add mm5 with unsigned saturation.
paddusw( mm0,mm5);
paddusw( mm1,mm5);
paddusw( mm2,mm5);
paddusw( mm3,mm5);
// Step 3: add mm6 (wrapping).
paddw( mm0,mm6);
paddw( mm1,mm6);
paddw( mm2,mm6);
paddw( mm3,mm6);
// Step 4: subtract mm6 with unsigned saturation.
psubusw( mm0,mm6);
psubusw( mm1,mm6);
psubusw( mm2,mm6);
psubusw( mm3,mm6);
// Step 5: arithmetic shift right by 3.
psraw( mm0,3);
psraw( mm1,3);
psraw( mm2,3);
psraw( mm3,3);
// Interleave the 16-bit words of the four registers into the output order.
// NOTE(review): this is a fixed word permutation, not a plain 4x4 transpose;
// which coefficient ends up where depends on the DCT layout defined elsewhere.
__m64 mm7;
movq (mm7,mm0);
punpcklwd (mm0,mm2);
punpckhwd (mm7,mm2);
movq (mm2,mm1);
punpcklwd (mm1,mm3);
punpckhwd (mm2,mm3);
movq (mm3,mm0);
punpcklwd (mm0,mm1);
punpckhwd (mm3,mm7);
punpcklwd (mm7,mm2);
punpckhwd (mm1,mm2);
// Store the four permuted groups.
movq (dst0,mm0);
movq (dst1,mm7);
movq (dst2,mm3);
movq (dst3,mm1);
}
// Hard-thresholds a 64-coefficient block from src0 into dst0.
//
// With threshold1 = qp*16 - 1 (bias is currently fixed at 0), three constants
// are prepared -- threshold1+1, threshold1+5 and threshold1-4 -- each loaded
// with movd and run through packssdw twice. requant_core_hard is then called
// four times with fixed byte offsets into the source and destination blocks.
// Finally dst0[0] is overwritten with (src0[0] + 4) >> 3, so the first
// coefficient bypasses the threshold.
// NOTE(review): the movd + double packssdw sequence is presumably meant to
// replicate the value into all four 16-bit lanes -- confirm.
void TimgFilterPostprocSpp::hardthresh_mmx(DCTELEM dst0[64], const DCTELEM src0[64], int qp)
{
    const int bias = 0; //FIXME
    const unsigned int threshold1 = qp*((1<<4) - bias) - 1;

    __m64 subReg, addSatReg, offsetReg;

    movd(subReg, threshold1+1);
    packssdw(subReg, subReg);
    packssdw(subReg, subReg);

    movd(addSatReg, threshold1+5);
    packssdw(addSatReg, addSatReg);
    packssdw(addSatReg, addSatReg);

    movd(offsetReg, threshold1-4);
    packssdw(offsetReg, offsetReg);
    packssdw(offsetReg, offsetReg);

    // Byte-addressed views of the two coefficient blocks.
    const unsigned char *srcBytes = reinterpret_cast<const unsigned char*>(src0);
    unsigned char *dstBytes = reinterpret_cast<unsigned char*>(dst0);

    requant_core_hard(dstBytes +  0, dstBytes +   8, dstBytes +  16, dstBytes +  24,
                      srcBytes +  0, srcBytes +   8, srcBytes +  64, srcBytes +  72,
                      subReg, addSatReg, offsetReg);
    requant_core_hard(dstBytes + 32, dstBytes +  40, dstBytes +  48, dstBytes +  56,
                      srcBytes + 16, srcBytes +  24, srcBytes +  48, srcBytes +  56,
                      subReg, addSatReg, offsetReg);
    requant_core_hard(dstBytes + 64, dstBytes +  72, dstBytes +  80, dstBytes +  88,
                      srcBytes + 32, srcBytes +  40, srcBytes +  96, srcBytes + 104,
                      subReg, addSatReg, offsetReg);
    requant_core_hard(dstBytes + 96, dstBytes + 104, dstBytes + 112, dstBytes + 120,
                      srcBytes + 80, srcBytes +  88, srcBytes + 112, srcBytes + 120,
                      subReg, addSatReg, offsetReg);

    // The first coefficient is only rounded and scaled, never thresholded.
    dst0[0] = DCTELEM((src0[0] + 4) >> 3);
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?