📄 timgfilterdct.cpp
字号:
movdqa (xmm1, src+1*16); // In1
psubsw (xmm1, xmm6); // t6 = In1-In6
paddsw (xmm6, src+1*16); // t1 = In1+In6
psubsw (xmm7, xmm4); // tm03 = t0-t3
psubsw (xmm6, xmm5); // tm12 = t1-t2
paddsw (xmm4, xmm4); // 2.t3
paddsw (xmm5, xmm5); // 2.t2
paddsw (xmm4, xmm7); // tp03 = t0+t3
paddsw (xmm5, xmm6); // tp12 = t1+t2
psllw (xmm2, shift+1); // shift t5 (shift +1 to..
psllw (xmm1, shift+1); // shift t6 ..compensate cos4/2)
psllw (xmm4, shift); // shift t3
psllw (xmm5, shift); // shift t2
psllw (xmm7, shift); // shift t0
psllw (xmm6, shift); // shift t1
psllw (xmm3, shift); // shift t4
psllw (xmm0, shift); // shift t7
psubsw (xmm4, xmm5); // out4 = tp03-tp12
psubsw (xmm1, xmm2); // xmm1: t6-t5
paddsw (xmm5, xmm5);
paddsw (xmm2, xmm2);
paddsw (xmm5, xmm4); // out0 = tp03+tp12
movdqa (src+4*16, xmm4); // => out4
paddsw (xmm2, xmm1 ); // xmm2: t6+t5
movdqa (src+0*16, xmm5); // => out0
const __m128i tan1=_mm_set1_epi16(0x32ec); // tan( pi/16))
const __m128i tan2=_mm_set1_epi16(0x6a0a); // tan(2pi/16) (=sqrt(2)-1)
const __m128i tan3=_mm_set1_epi16(0xab0e); // tan(3pi/16)-1
const __m128i sqrt2=_mm_set1_epi16(0x5a82); // 0.5/sqrt(2)
const __m128i Rounder1=_mm_set_epi16(1,1,1,1, 1,1,1,1);
movdqa (xmm4, tan2); // xmm4 <= tan2
pmulhw (xmm4, xmm7); // tm03*tan2
movdqa (xmm5, tan2); // xmm5 <= tan2
psubsw (xmm4, xmm6); // out6 = tm03*tan2 - tm12
pmulhw (xmm5, xmm6); // tm12*tan2
paddsw (xmm5, xmm7); // out2 = tm12*tan2 + tm03
movdqa (xmm6, sqrt2);
movdqa (xmm7, Rounder1);
pmulhw (xmm2, xmm6); // xmm2: tp65 = (t6 + t5)*cos4
por (xmm5, xmm7); // correct out2
por (xmm4, xmm7); // correct out6
pmulhw (xmm1, xmm6); // xmm1: tm65 = (t6 - t5)*cos4
por (xmm2, xmm7); // correct tp65
movdqa (src+2*16, xmm5); // => out2
movdqa (xmm5, xmm3 ); // save t4
movdqa (src+6*16, xmm4); // => out6
movdqa (xmm4, xmm0 ); // save t7
psubsw (xmm3, xmm1); // xmm3: tm465 = t4 - tm65
psubsw (xmm0, xmm2); // xmm0: tm765 = t7 - tp65
paddsw (xmm2, xmm4); // xmm2: tp765 = t7 + tp65
paddsw (xmm1, xmm5); // xmm1: tp465 = t4 + tm65
movdqa (xmm4, tan3); // tan3 - 1
movdqa (xmm5, tan1); // tan1
movdqa (xmm7, xmm3); // save tm465
pmulhw (xmm3, xmm4); // tm465*(tan3-1)
movdqa (xmm6, xmm1); // save tp465
pmulhw (xmm1, xmm5); // tp465*tan1
paddsw (xmm3, xmm7); // tm465*tan3
pmulhw (xmm4, xmm0); // tm765*(tan3-1)
paddsw (xmm4, xmm0); // tm765*tan3
pmulhw (xmm5, xmm2); // tp765*tan1
paddsw (xmm1, xmm2); // out1 = tp765 + tp465*tan1
psubsw (xmm0, xmm3); // out3 = tm765 - tm465*tan3
paddsw (xmm7, xmm4); // out5 = tm465 + tm765*tan3
psubsw (xmm5, xmm6); // out7 =-tp465 + tp765*tan1
movdqa (src+1*16, xmm1); // => out1
movdqa (src+3*16, xmm0); // => out3
movdqa (src+5*16, xmm7); // => out5
movdqa (src+7*16, xmm5); // => out7
}
// Transforms one 16-byte row (eight 16-bit values) of the block in place.
// The row is folded into sums (a) and differences (b) of mirrored elements,
// multiplied against a coefficient table with pmaddwd, reduced to 16 bits,
// rounded, shifted right by 4 and written back to the same row.
//   ecx       - base address of the block; row r lives at ecx + r*16
//   src       - index of the row to process
//   Coeffs    - coefficient table, read as four 16-byte groups at offsets 0, 16, 32, 48
//   rounders  - 16-byte rounding constant added to the packed result before the final shift
//   xmm0-xmm3 - caller-supplied scratch registers; all four are overwritten
// NOTE(review): movdqa, paddsw, psubsw, punpckldq, pmaddwd, paddd, psrad,
// packssdw and psraw are helpers defined outside this view; they presumably map
// to the SSE2 instructions of the same name (dst op= src), which would also mean
// ecx, Coeffs and rounders must be 16-byte aligned - confirm.
static __forceinline void fMTX_MULT(unsigned char *ecx,int src,const unsigned char *Coeffs,const unsigned char *rounders,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3)
{
// Load the row and build two operands: the low four words replicated (0123 0123)
// and the high four words reversed and replicated (7654 7654).
movdqa ( xmm0, ecx+src*16+0); // xmm0 = 01234567
xmm1=_mm_shufflehi_epi16(xmm0,0x1b);// pshufhw ( xmm1, xmm0, 00011011b); // xmm1 = ----7654
xmm0=_mm_shuffle_epi32(xmm0,0x44); //pshufd ( xmm0, xmm0, 01000100b);
xmm1=_mm_shuffle_epi32(xmm1,0xee); //pshufd ( xmm1, xmm1, 11101110b);
// Butterfly: a[i] = x[i] + x[7-i], b[i] = x[i] - x[7-i]
movdqa ( xmm2, xmm0);
paddsw (xmm0, xmm1); // xmm0 = a0 a1 a2 a3
psubsw (xmm2, xmm1); // xmm2 = b0 b1 b2 b3
// Interleave a/b pairs, then make a second copy with the 64-bit halves swapped
// (0x4e selects dwords 2,3,0,1) so both orderings are available to pmaddwd.
punpckldq (xmm0, xmm2); // xmm0 = a0 a1 b0 b1 a2 a3 b2 b3
xmm2=_mm_shuffle_epi32(xmm0,0x4e);//pshufd (xmm2, xmm0, 01001110b); // xmm2 = a2 a3 b2 b3 a0 a1 b0 b1
// M00 M01 M16 M17 M06 M07 M22 M23 x mm0 = 0 /1 /2'/3'
// M02 M03 M18 M19 M04 M05 M20 M21 x mm2 = 0'/1'/2 /3
// M08 M09 M24 M25 M14 M15 M30 M31 x mm0 = 4 /5 /6'/7'
// M10 M11 M26 M27 M12 M13 M28 M29 x mm2 = 4'/5'/6 /7
// Four multiply-accumulate products; the partial sums are combined pairwise below.
movdqa (xmm1, Coeffs+16);
movdqa (xmm3, Coeffs+32);
pmaddwd (xmm1, xmm2);
pmaddwd (xmm3, xmm0);
pmaddwd (xmm2, Coeffs+48);
pmaddwd (xmm0, Coeffs+ 0);
paddd (xmm0, xmm1); // out0 | out1 out2 | out3
paddd (xmm2, xmm3); // out4 | out5 out6 | out7
// Drop the low 16 bits of each 32-bit sum and pack back down to eight 16-bit values.
psrad (xmm0, 16 );
psrad (xmm2, 16 );
packssdw (xmm0, xmm2); // out0 .. out7
paddsw (xmm0, rounders); // Round
psraw (xmm0, 4); // => -2048, 2047
// Write the transformed row back over the input row.
movdqa (ecx+src*16+0, xmm0);
}
};
__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
unsigned char *ecx=(unsigned char*)block;
Tfdct::fLLM_PASS(ecx,3,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7);
Tfdct::fMTX_MULT(ecx,0, (const unsigned char*)fTab1, (const unsigned char*)Fdct_Rnd0,xmm0,xmm1,xmm2,xmm3);
Tfdct::fMTX_MULT(ecx,1, (const unsigned char*)fTab2, (const unsigned char*)Fdct_Rnd2,xmm0,xmm1,xmm2,xmm3);
Tfdct::fMTX_MULT(ecx,2, (const unsigned char*)fTab3, (const unsigned char*)Fdct_Rnd1,xmm0,xmm1,xmm2,xmm3);
Tfdct::fMTX_MULT(ecx,3, (const unsigned char*)fTab4, (const unsigned char*)Fdct_Rnd1,xmm0,xmm1,xmm2,xmm3);
Tfdct::fMTX_MULT(ecx,4, (const unsigned char*)fTab1, (const unsigned char*)Fdct_Rnd0,xmm0,xmm1,xmm2,xmm3);
Tfdct::fMTX_MULT(ecx,5, (const unsigned char*)fTab4, (const unsigned char*)Fdct_Rnd1,xmm0,xmm1,xmm2,xmm3);
Tfdct::fMTX_MULT(ecx,6, (const unsigned char*)fTab3, (const unsigned char*)Fdct_Rnd1,xmm0,xmm1,xmm2,xmm3);
Tfdct::fMTX_MULT(ecx,7, (const unsigned char*)fTab2, (const unsigned char*)Fdct_Rnd1,xmm0,xmm1,xmm2,xmm3);
}
// In-place SSE2 inverse DCT of the coefficient block at `block`.
// The block is addressed as eight rows of 16 bytes (eight 16-bit values each).
// Processing is done in two stages:
//   1. Tidct::iMTX_MULT is run once per row (rows 0..7), each with its own
//      coefficient table and rounding constant and a right shift of 11.
//   2. Tidct::iLLM_PASS is run once over the whole block, combining the eight
//      rows with 128-bit operations (i.e. all eight columns at a time) and
//      shifting the results right by 6.
// NOTE(review): movdqa, pmaddwd, paddd, psubd, psrad, packssdw, pmulhw, paddsw,
// psubsw and psraw are helpers defined outside this view; they presumably map to
// the SSE2 instructions of the same name (dst op= src). If so, `block` and the
// static tables must be 16-byte aligned - confirm for `block` at the call sites.
void TimgFilterDCT::idct_sse2(short *block)
{
// Row-pass coefficient tables. Each holds 32 16-bit entries (64 bytes) that
// iMTX_MULT reads as four 16-byte groups. Rows 0 and 4 use iTab1, rows 1 and 7
// use iTab2, rows 2 and 6 use iTab3, rows 3 and 5 use iTab4 (see the calls at
// the end of this function).
static __align16(const unsigned short,iTab1[])=
{
0x4000, 0x539f, 0x4000, 0x22a3,
0x4000, 0xdd5d, 0x4000, 0xac61,
0x4000, 0x22a3, 0xc000, 0xac61,
0xc000, 0x539f, 0x4000, 0xdd5d,
0x58c5, 0x4b42, 0x4b42, 0xee58,
0x3249, 0xa73b, 0x11a8, 0xcdb7,
0x3249, 0x11a8, 0xa73b, 0xcdb7,
0x11a8, 0x4b42, 0x4b42, 0xa73b
};
static __align16(const unsigned short,iTab2[])=
{
0x58c5, 0x73fc, 0x58c5, 0x300b,
0x58c5, 0xcff5, 0x58c5, 0x8c04,
0x58c5, 0x300b, 0xa73b, 0x8c04,
0xa73b, 0x73fc, 0x58c5, 0xcff5,
0x7b21, 0x6862, 0x6862, 0xe782,
0x45bf, 0x84df, 0x187e, 0xba41,
0x45bf, 0x187e, 0x84df, 0xba41,
0x187e, 0x6862, 0x6862, 0x84df
};
static __align16(const unsigned short,iTab3[])=
{
0x539f, 0x6d41, 0x539f, 0x2d41,
0x539f, 0xd2bf, 0x539f, 0x92bf,
0x539f, 0x2d41, 0xac61, 0x92bf,
0xac61, 0x6d41, 0x539f, 0xd2bf,
0x73fc, 0x6254, 0x6254, 0xe8ee,
0x41b3, 0x8c04, 0x1712, 0xbe4d,
0x41b3, 0x1712, 0x8c04, 0xbe4d,
0x1712, 0x6254, 0x6254, 0x8c04
};
static __align16(const unsigned short,iTab4[])=
{
0x4b42, 0x6254, 0x4b42, 0x28ba,
0x4b42, 0xd746, 0x4b42, 0x9dac,
0x4b42, 0x28ba, 0xb4be, 0x9dac,
0xb4be, 0x6254, 0x4b42, 0xd746,
0x6862, 0x587e, 0x587e, 0xeb3d,
0x3b21, 0x979e, 0x14c3, 0xc4df,
0x3b21, 0x14c3, 0x979e, 0xc4df,
0x14c3, 0x587e, 0x587e, 0x979e
};
// Per-row rounding constants (four identical 32-bit lanes each); Idct_RndN is
// added to the 32-bit sums of row N before the shift in iMTX_MULT.
static __align16(const uint32_t,Idct_Rnd0[])={65535, 65535, 65535, 65535};
static __align16(const uint32_t,Idct_Rnd1[])={ 3612, 3612, 3612, 3612};
static __align16(const uint32_t,Idct_Rnd2[])={ 2271, 2271, 2271, 2271};
static __align16(const uint32_t,Idct_Rnd3[])={ 1203, 1203, 1203, 1203};
static __align16(const uint32_t,Idct_Rnd4[])={ 1023, 1023, 1023, 1023};
static __align16(const uint32_t,Idct_Rnd5[])={ 102, 102, 102, 102};
static __align16(const uint32_t,Idct_Rnd6[])={ 398, 398, 398, 398};
static __align16(const uint32_t,Idct_Rnd7[])={ 469, 469, 469, 469};
// Local helper struct holding the two passes of the transform.
struct Tidct
{
// Row pass: transforms the 16-byte row at ecx+src*16 in place.
//   ecx     - base address of the block
//   src     - row index
//   Table   - coefficient table, read as four 16-byte groups at offsets 0, 16, 32, 48
//   rounder - 16-byte rounding constant added to the 32-bit 'a' sums
//   shift   - arithmetic right shift applied to the 32-bit results before packing
//   xmm0, xmm4-xmm7 - caller-supplied scratch registers; all are overwritten
static __forceinline void iMTX_MULT(unsigned char *ecx,int src,const unsigned char *Table,const unsigned char *rounder,int shift,__m128i &xmm0,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
{
movdqa (xmm0, ecx+src*16); // xmm0 = 01234567
// Regroup the words into even/odd pairs, then broadcast each pair to all four dword lanes.
xmm0=_mm_shufflelo_epi16(xmm0,0xd8);// pshuflw xmm0, xmm0, 11011000b // 02134567 // these two shufflings could be
xmm0=_mm_shufflehi_epi16(xmm0,0xd8);// pshufhw xmm0, xmm0, 11011000b // 02134657 // integrated in zig-zag orders
xmm4=_mm_shuffle_epi32(xmm0,0x00);// pshufd xmm4, xmm0, 00000000b // 02020202
xmm5=_mm_shuffle_epi32(xmm0,0xaa);// pshufd xmm5, xmm0, 10101010b // 46464646
xmm6=_mm_shuffle_epi32(xmm0,0x55);// pshufd xmm6, xmm0, 01010101b // 13131313
xmm7=_mm_shuffle_epi32(xmm0,0xff);// pshufd xmm7, xmm0, 11111111b // 57575757
// Even inputs (0,2,4,6) feed the 'a' terms, odd inputs (1,3,5,7) feed the 'b' terms.
pmaddwd (xmm4, Table+ 0); // dot M00,M01 M04,M05 M08,M09 M12,M13
pmaddwd (xmm5, Table+16); // dot M02,M03 M06,M07 M10,M11 M14,M15
pmaddwd (xmm6, Table+32); // dot M16,M17 M20,M21 M24,M25 M28,M29
pmaddwd (xmm7, Table+48); // dot M18,M19 M22,M23 M26,M27 M30,M31
// The rounder is added to 'a' only, so it ends up in both a+b and a-b.
paddd (xmm4, rounder); // Round
paddd (xmm6, xmm7); // b0|b1|b2|b3
paddd (xmm4, xmm5); // a0|a1|a2|a3
movdqa (xmm7, xmm6);
paddd (xmm6, xmm4); // mm6=a+b
psubd (xmm4, xmm7); // mm4=a-b
psrad (xmm6, shift); // => out 0123
psrad (xmm4, shift); // => out 7654
packssdw (xmm6, xmm4); // 01237654
// Reverse the upper four words to restore natural order, then store over the input row.
xmm6=_mm_shufflehi_epi16(xmm6,0x1b);// pshufhw (xmm6, xmm6, 00011011b) // 01234567
movdqa (ecx+src*16, xmm6);
}
// Second pass: combines the eight 16-byte rows at src+16*k (k = 0..7) and
// writes the eight result rows back in place, each shifted right by 6.
// Comments name row k as xk.
//   src       - base address of the block
//   xmm0-xmm7 - caller-supplied scratch registers; all are overwritten
// Row 0 of the block (src+0*16) is also used as a temporary spill slot for b3
// after x0 has been loaded; it is overwritten with out0 at the end.
static __forceinline void iLLM_PASS(unsigned char *src,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
{
// 16-bit fixed-point constants used as the multiplier of pmulhw.
// tan3 is stored as tan(3pi/16)-1, so the operand is added back after each
// multiply to obtain x*tan3 (see the paddsw lines that follow the pmulhw).
const __m128i tan1=_mm_set1_epi16(0x32ec); // tan( pi/16))
const __m128i tan2=_mm_set1_epi16(0x6a0a); // tan(2pi/16) (=sqrt(2)-1)
const __m128i tan3=_mm_set1_epi16(0xab0e); // tan(3pi/16)-1
const __m128i sqrt2=_mm_set1_epi16(0x5a82); // 0.5/sqrt(2)
// ---- odd part: rows 1, 3, 5, 7 -> b0, b1, b2, b3 ----
movdqa (xmm0, tan3); // t3-1
movdqa (xmm3, src+16*3); // x3
movdqa (xmm1, xmm0 ); // t3-1
movdqa (xmm5, src+16*5); // x5
movdqa (xmm4, tan1 ); // t1
movdqa (xmm6, src+16*1); // x1
movdqa (xmm7, src+16*7); // x7
movdqa (xmm2, xmm4 ); // t1
pmulhw (xmm0, xmm3); // x3*(t3-1)
pmulhw (xmm1, xmm5); // x5*(t3-1)
paddsw (xmm0, xmm3); // x3*t3
paddsw (xmm1, xmm5); // x5*t3
psubsw (xmm0, xmm5); // x3*t3-x5 = tm35
paddsw (xmm1, xmm3); // x3+x5*t3 = tp35
pmulhw (xmm4, xmm7); // x7*t1
pmulhw (xmm2, xmm6); // x1*t1
paddsw (xmm4, xmm6); // x1+t1*x7 = tp17
psubsw (xmm2, xmm7); // x1*t1-x7 = tm17
movdqa (xmm3, sqrt2);
movdqa (xmm7, xmm4 );
movdqa (xmm6, xmm2 );
psubsw (xmm4, xmm1 ); // tp17-tp35 = t1
psubsw (xmm2, xmm0 ); // tm17-tm35 = b3
paddsw (xmm1, xmm7 ); // tp17+tp35 = b0
paddsw (xmm0, xmm6 ); // tm17+tm35 = t2
// xmm1 = b0, xmm2 = b3. preserved
movdqa (xmm6, xmm4 );
psubsw (xmm4, xmm0 ); // t1-t2
paddsw (xmm0, xmm6 ); // t1+t2
// The constant is 0.5/sqrt(2), so each product is doubled afterwards.
pmulhw (xmm4, xmm3 ); // (t1-t2)/(2.sqrt2)
pmulhw (xmm0, xmm3 ); // (t1+t2)/(2.sqrt2)
paddsw (xmm0, xmm0 ); // 2.(t1+t2) = b1
paddsw (xmm4, xmm4 ); // 2.(t1-t2) = b2
// ---- even part: rows 0, 2, 4, 6 ----
movdqa (xmm7, tan2 ); // t2
movdqa (xmm3, src+2*16); // x2
movdqa (xmm6, src+6*16); // x6
movdqa (xmm5, xmm7 ); // t2
pmulhw (xmm7, xmm6 ); // x6*t2
pmulhw (xmm5, xmm3 ); // x2*t2
paddsw (xmm7, xmm3 ); // x2+x6*t2 = tp26
psubsw (xmm5, xmm6 ); // x2*t2-x6 = tm26
// use:xmm3,xmm5,xmm6,xmm7 frozen: xmm0,xmm4,xmm1,xmm2
movdqa (xmm3, src+0*16); // x0
movdqa (xmm6, src+4*16); // x4
// we spill 1 reg to perform safe butterflies
// b3 (xmm2) is parked in row 0, which has already been read into xmm3,
// so that xmm2 can serve as the temporary for the butterflies below.
movdqa( src , xmm2);
movdqa( xmm2, xmm3 );
psubsw( xmm3, xmm6 ); // x0-x4 = tm04
paddsw( xmm6, xmm2 ); // x0+x4 = tp04
movdqa( xmm2, xmm6 );
psubsw( xmm6, xmm7 ); // tp04-tp26 = a3
paddsw( xmm7, xmm2 ); // tp04+tp26 = a0
movdqa( xmm2, xmm3 );
psubsw( xmm3, xmm5 ); // tm04-tm26
paddsw( xmm5, xmm2 ); // tm04+tm26
// Combine the two middle even terms with b1 and b2 to form out1/out6 and out2/out5.
movdqa( xmm2, xmm5 );
psubsw( xmm5, xmm0 ); // (tm04+tm26)-b1
paddsw( xmm0, xmm2 ); // (tm04+tm26)+b1
movdqa( xmm2, xmm3 );
psubsw( xmm3, xmm4 ); // (tm04-tm26)-b2
paddsw( xmm4, xmm2 ); // (tm04-tm26)+b2
// Reload the spilled b3 from row 0.
movdqa( xmm2, src );
psraw ( xmm5, 6); // out6
psraw ( xmm3, 6); // out5
psraw ( xmm0, 6); // out1
psraw ( xmm4, 6); // out2
movdqa( src+6*16, xmm5);
movdqa( src+5*16, xmm3);
movdqa( src+1*16, xmm0);
movdqa( src+2*16, xmm4);
// reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3
movdqa( xmm0, xmm7);
movdqa( xmm4, xmm6);
psubsw( xmm7, xmm1); // a0-b0
psubsw( xmm6, xmm2); // a3-b3
paddsw( xmm1, xmm0); // a0+b0
paddsw( xmm2, xmm4); // a3+b3
psraw ( xmm1, 6 ); // out0
psraw ( xmm7, 6 ); // out7
psraw ( xmm2, 6 ); // out3
psraw ( xmm6, 6 ); // out4
// combine result
movdqa( src+0*16, xmm1);
movdqa( src+3*16, xmm2);
movdqa( src+4*16, xmm6);
movdqa( src+7*16, xmm7);
};
};
// Driver: the block is addressed in bytes so that row r sits at ecx + r*16.
unsigned char *ecx=(unsigned char*)block;
// Scratch registers shared by all helper calls; they carry no state between calls
// (each helper loads its inputs from memory before using them).
__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
// Stage 1: per-row pass, one table/rounder pair per row, shift of 11.
Tidct::iMTX_MULT(ecx,0, (const unsigned char*)iTab1, (const unsigned char*)Idct_Rnd0, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
Tidct::iMTX_MULT(ecx,1, (const unsigned char*)iTab2, (const unsigned char*)Idct_Rnd1, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
Tidct::iMTX_MULT(ecx,2, (const unsigned char*)iTab3, (const unsigned char*)Idct_Rnd2, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
Tidct::iMTX_MULT(ecx,3, (const unsigned char*)iTab4, (const unsigned char*)Idct_Rnd3, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
Tidct::iMTX_MULT(ecx,4, (const unsigned char*)iTab1, (const unsigned char*)Idct_Rnd4, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
Tidct::iMTX_MULT(ecx,5, (const unsigned char*)iTab4, (const unsigned char*)Idct_Rnd5, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
Tidct::iMTX_MULT(ecx,6, (const unsigned char*)iTab3, (const unsigned char*)Idct_Rnd6, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
Tidct::iMTX_MULT(ecx,7, (const unsigned char*)iTab2, (const unsigned char*)Idct_Rnd7, 11,xmm0,xmm4,xmm5,xmm6,xmm7);
// Stage 2: single pass across all eight rows, results written back into the block.
Tidct::iLLM_PASS(ecx,xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7);
}
#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -