📄 skl_dct.c
字号:
/********************************************************
 * Some code. Copyright (C) 2003 by Pascal Massimino.   *
 * All Rights Reserved.      (http://skal.planet-d.net) *
 * For Educational/Academic use ONLY.                   *
 ********************************************************/
/*
 * skl_dct.cpp
 *
 * "Fast and precise" LLM implementation of FDCT/IDCT, where
 * rotations are decomposed using:
 *    tmp = (x+y).cos t
 *    x' = tmp + y.(sin t - cos t)
 *    y' = tmp - x.(sin t + cos t)
 *
 * See details at http://skal.planet-d.net/coding/dct.html
 * and at the end of this file...
 *
 * Reference (e.g.):
 *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
 *    Practical Fast 1D DCT Algorithm with Eleven Multiplications,
 *  Proc. ICASSP 1989, 988-991.
 *
 *  IEEE-1180-like error specs for FDCT:
 *    Peak error:   1.0000
 *    Peak MSE:     0.0340
 *    Overall MSE:  0.0200
 *    Peak ME:      0.0191
 *    Overall ME:   -0.0033
 *
 *  error specs for IDCT:
 *    Peak error:   1.0000
 *    Peak MSE:     0.0065
 *    Overall MSE:  0.0051
 *    Peak ME:      0.0015
 *    Overall ME:   0.0000
 *
 ********************************************************/

#define TYPE short      // a priori: 16b input

  // C-version
extern void Skl_IDct16_C( TYPE *In );
extern void Skl_Dct16_C( TYPE *In );

  // SSE-version
extern void Skl_IDct16_SSE( TYPE *In );
extern void Skl_Dct16_SSE( TYPE *In );

  // MMX-version
extern void Skl_IDct16_MMX( TYPE *In );
extern void Skl_Dct16_MMX( TYPE *In );

//////////////////////////////////////////////////////////
// Helper macros.
//
// All multi-statement macros are wrapped in do { } while (0)
// so that each use behaves as a single statement.
//////////////////////////////////////////////////////////

  // m1 = S[a] + S[b],  m2 = S[a] - S[b].
  // 'tmp' is accepted but not used by the expansion.
#define LOAD_BUTF(m1, m2, a, b, tmp, S) \
  do {                                  \
    (m1) = (S)[(a)] + (S)[(b)];         \
    (m2) = (S)[(a)] - (S)[(b)];         \
  } while (0)

  // In-place butterfly: (a, b) <- (a+b, a-b), using 'tmp' as scratch.
#define BUTF(a, b, tmp) \
  do {                  \
    (tmp) = (a)+(b);    \
    (b)   = (a)-(b);    \
    (a)   = (tmp);      \
  } while (0)

  // Three-multiplication rotation (see the file header) followed by
  // rounding (+Rnd) and descaling (>>Fix):
  //   m1 <- ( m1*k1 + (m1+m2)*c + Rnd ) >> Fix
  //   m2 <- ( m2*k2 + (m1+m2)*c + Rnd ) >> Fix
#define ROTATE(m1,m2,c,k1,k2,tmp,Fix,Rnd) \
  do {                                    \
    (tmp) = ( (m1) + (m2) )*(c);          \
    (m1) *= (k1);                         \
    (m2) *= (k2);                         \
    (tmp) += (Rnd);                       \
    (m1) = ((m1)+(tmp))>>(Fix);           \
    (m2) = ((m2)+(tmp))>>(Fix);           \
  } while (0)

  // Same rotation as ROTATE, without rounding or descaling.
#define ROTATE2(m1,m2,c,k1,k2,tmp)  \
  do {                              \
    (tmp) = ( (m1) + (m2) )*(c);    \
    (m1) *= (k1);                   \
    (m2) *= (k2);                   \
    (m1) = (m1)+(tmp);              \
    (m2) = (m2)+(tmp);              \
  } while (0)

  // Variant that only reads m2 ('k1' and 'tmp' are not used by the
  // expansion). Not referenced by the two C routines below.
#define ROTATE0(m1,m2,c,k1,k2,tmp)  \
  do {                              \
    (m1) = ( (m2) )*(c);            \
    (m2) = (m2)*(k2)+(m1);          \
  } while (0)

  // Scale up by 2^n. Written as a multiplication because left-shifting
  // a negative signed value is undefined behaviour in C.
#define SHIFTL(x,n)   ((x)*(1<<(n)))
  // Scale down by 2^n. The operands here are signed ints; right-shifting a
  // negative value is implementation-defined (arithmetic shift is assumed).
#define SHIFTR(x, n)  ((x)>>(n))
  // 2^(n-1): the rounding constant matching a later >>n.
#define HALF(n)       (1<<((n)-1))

#define IPASS 3   // extra precision bits kept between the two IDCT passes
#define FPASS 2   // extra precision bits kept between the two FDCT passes
#define FIX  16   // fractional bits of the ROT* fixed-point constants

#if 1

#define ROT6_C     35468
#define ROT6_SmC   50159
#define ROT6_SpC  121095
#define ROT17_C    77062
#define ROT17_SmC  25571
#define ROT17_SpC 128553
#define ROT37_C    58981
#define ROT37_SmC  98391
#define ROT37_SpC  19571
#define ROT13_C   167963
#define ROT13_SmC 134553
#define ROT13_SpC 201373

#else

  // Reference derivation of the constants above (disabled).
  // NOTE(review): this branch relies on run-time initialisers and on
  // floor()/cos()/M_PI, so it needs <math.h> and a C++ compiler.
#define FX(x) ( (int)floor((x)*(1<<FIX) + .5 ) )

static const double c1 = cos(1.*M_PI/16);
static const double c2 = cos(2.*M_PI/16);
static const double c3 = cos(3.*M_PI/16);
static const double c4 = cos(4.*M_PI/16);
static const double c5 = cos(5.*M_PI/16);
static const double c6 = cos(6.*M_PI/16);
static const double c7 = cos(7.*M_PI/16);

static const int ROT6_C   = FX(c2-c6);  // 0.541
static const int ROT6_SmC = FX(2*c6);   // 0.765
static const int ROT6_SpC = FX(2*c2);   // 1.847

static const int ROT17_C   = FX(c1+c7);  // 1.175
static const int ROT17_SmC = FX(2*c7);   // 0.390
static const int ROT17_SpC = FX(2*c1);   // 1.961

static const int ROT37_C   = FX((c3-c7)/c4);  // 0.899
static const int ROT37_SmC = FX(2*(c5+c7));   // 1.501
static const int ROT37_SpC = FX(2*(c1-c3));   // 0.298

static const int ROT13_C   = FX((c1+c3)/c4);  // 2.562
static const int ROT13_SmC = FX(2*(c3+c7));   // 2.053
static const int ROT13_SpC = FX(2*(c1+c5));   // 3.072

#endif

//////////////////////////////////////////////////////////
// Forward 8x8 DCT, in place.
//
// In: 64 values, processed first as 8 runs of 8 consecutive
//     elements (pass 1), then as 8 stride-8 columns (pass 2).
//     The result overwrites the input.
//
// Pass 1 keeps FPASS extra fractional bits; pass 2 removes
// them together with a further 3 bits (>> FPASS+3), rounding
// at each descale. With this scaling a constant block of
// value v yields In[0] == 8*v and all other outputs 0.
//////////////////////////////////////////////////////////

void Skl_Dct16_C( TYPE *In )
{
  TYPE *pIn;
  int i;

    // pass 1: the 8 consecutive-element runs
  pIn = In;
  for(i=8; i>0; --i)
  {
    int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill;

      // even-indexed outputs (0, 2, 4, 6): built from the sums
    LOAD_BUTF(mm1,mm6, 1, 6, mm0, pIn);
    LOAD_BUTF(mm2,mm5, 2, 5, mm0, pIn);
    LOAD_BUTF(mm3,mm4, 3, 4, mm0, pIn);
    LOAD_BUTF(mm0,mm7, 0, 7, Spill, pIn);

    BUTF(mm1, mm2, Spill);
    BUTF(mm0, mm3, Spill);

    ROTATE(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, Spill, FIX-FPASS, HALF(FIX-FPASS));
    pIn[2] = (TYPE)mm3;
    pIn[6] = (TYPE)mm2;

    BUTF(mm0, mm1, Spill);
    pIn[0] = (TYPE)SHIFTL(mm0, FPASS);
    pIn[4] = (TYPE)SHIFTL(mm1, FPASS);

      // odd-indexed outputs (1, 3, 5, 7): built from the differences
    mm3 = mm5 + mm7;
    mm2 = mm4 + mm6;
    ROTATE(mm2, mm3,  ROT17_C, -ROT17_SpC, -ROT17_SmC, mm0, FIX-FPASS, HALF(FIX-FPASS));
    ROTATE(mm4, mm7, -ROT37_C,  ROT37_SpC,  ROT37_SmC, mm0, FIX-FPASS, HALF(FIX-FPASS));
    mm7 += mm3;
    mm4 += mm2;
    pIn[1] = (TYPE)mm7;
    pIn[7] = (TYPE)mm4;

    ROTATE(mm5, mm6, -ROT13_C,  ROT13_SmC,  ROT13_SpC, mm0, FIX-FPASS, HALF(FIX-FPASS));
    mm5 += mm3;
    mm6 += mm2;
    pIn[3] = (TYPE)mm6;
    pIn[5] = (TYPE)mm5;

    pIn += 8;
  }

    // pass 2: the 8 stride-8 columns
  pIn = In;
  for(i=8; i>0; --i)
  {
    int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill;

      // even-indexed outputs (0, 2, 4, 6)
    LOAD_BUTF(mm1,mm6, 1*8, 6*8, mm0, pIn);
    LOAD_BUTF(mm2,mm5, 2*8, 5*8, mm0, pIn);
    BUTF(mm1, mm2, mm0);

    LOAD_BUTF(mm3,mm4, 3*8, 4*8, mm0, pIn);
    LOAD_BUTF(mm0,mm7, 0*8, 7*8, Spill, pIn);
    BUTF(mm0, mm3, Spill);

      // Fix=0: only the rounding constant is added here; the actual
      // descale is the SHIFTR at store time.
    ROTATE(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, Spill, 0, HALF(FIX+FPASS+3));
    pIn[2*8] = (TYPE)SHIFTR(mm3, FIX+FPASS+3);
    pIn[6*8] = (TYPE)SHIFTR(mm2, FIX+FPASS+3);

      // rounding term for outputs 0 and 4 (note: HALF-1, not HALF)
    mm0 += HALF(FPASS+3) - 1;
    BUTF(mm0, mm1, Spill);
    pIn[0*8] = (TYPE)SHIFTR(mm0, FPASS+3);
    pIn[4*8] = (TYPE)SHIFTR(mm1, FPASS+3);

      // odd-indexed outputs (1, 3, 5, 7)
    mm3 = mm5 + mm7;
    mm2 = mm4 + mm6;

      // the rounding constant enters through mm2/mm3 and is then
      // propagated by the += below, so ROTATE2 needs none
    ROTATE(mm2, mm3,  ROT17_C, -ROT17_SpC, -ROT17_SmC, mm0, 0, HALF(FIX+FPASS+3));
    ROTATE2(mm4, mm7, -ROT37_C,  ROT37_SpC,  ROT37_SmC, mm0);
    mm7 += mm3;
    mm4 += mm2;
    pIn[7*8] = (TYPE)SHIFTR(mm4, FIX+FPASS+3);
    pIn[1*8] = (TYPE)SHIFTR(mm7, FIX+FPASS+3);

    ROTATE2(mm5, mm6, -ROT13_C,  ROT13_SmC,  ROT13_SpC, mm0);
    mm5 += mm3;
    mm6 += mm2;
    pIn[5*8] = (TYPE)SHIFTR(mm5, FIX+FPASS+3);
    pIn[3*8] = (TYPE)SHIFTR(mm6, FIX+FPASS+3);

    pIn++;
  }
}

//////////////////////////////////////////////////////////
// Inverse 8x8 DCT, in place.
//
// In: 64 coefficients, processed first as 8 runs of 8
//     consecutive elements (pass 1), then as 8 stride-8
//     columns (pass 2). The result overwrites the input.
//
// Pass 1 keeps IPASS extra bits (>> FIX-IPASS); pass 2 removes
// them together with a further 3 bits (>> FIX+IPASS+3). With
// this scaling In[0] == 8*v (all other inputs 0) yields a
// constant block of value v.
//////////////////////////////////////////////////////////

void Skl_IDct16_C( TYPE *In )
{
  TYPE *pIn;
  int i;

    // pass 1: the 8 consecutive-element runs
  pIn = In;
  for (i=8; i>0; --i)
  {
    int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill;

      // odd-indexed inputs (1, 3, 5, 7)
    mm4 = (int)pIn[7];
    mm5 = (int)pIn[5];
    mm6 = (int)pIn[3];
    mm7 = (int)pIn[1];

    mm2 = mm4 + mm6;
    mm3 = mm5 + mm7;
    ROTATE2(mm2, mm3,  ROT17_C, -ROT17_SpC, -ROT17_SmC, mm1);
    ROTATE2(mm4, mm7, -ROT37_C,  ROT37_SpC,  ROT37_SmC, mm1);
    ROTATE2(mm5, mm6, -ROT13_C,  ROT13_SmC,  ROT13_SpC, mm1);

    mm4 += mm2;
    mm5 += mm3;
    mm6 += mm2;
    mm7 += mm3;

      // even-indexed inputs (0, 2, 4, 6)
    mm3 = (int)pIn[2];
    mm2 = (int)pIn[6];

    ROTATE2(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, mm1);

      // bring inputs 0/4 to the same FIX scale as the rotated terms,
      // and fold in the rounding constant for the final descale
    LOAD_BUTF(mm0, mm1, 0, 4, Spill, pIn);
    mm0 = SHIFTL(mm0, FIX) + HALF(FIX-IPASS);
    mm1 = SHIFTL(mm1, FIX) + HALF(FIX-IPASS);
    BUTF(mm0, mm3, Spill);
    BUTF(mm1, mm2, Spill);

      // final butterflies: even part +/- odd part
    BUTF(mm0, mm7, Spill);
    pIn[0] = (TYPE)SHIFTR(mm0, FIX-IPASS);
    pIn[7] = (TYPE)SHIFTR(mm7, FIX-IPASS);
    BUTF(mm1, mm6, mm0);
    pIn[1] = (TYPE)SHIFTR(mm1, FIX-IPASS);
    pIn[6] = (TYPE)SHIFTR(mm6, FIX-IPASS);
    BUTF(mm2, mm5, mm0);
    pIn[2] = (TYPE)SHIFTR(mm2, FIX-IPASS);
    pIn[5] = (TYPE)SHIFTR(mm5, FIX-IPASS);
    BUTF(mm3, mm4, mm0);
    pIn[3] = (TYPE)SHIFTR(mm3, FIX-IPASS);
    pIn[4] = (TYPE)SHIFTR(mm4, FIX-IPASS);

    pIn += 8;
  }

    // pass 2: the 8 stride-8 columns
  pIn = In;
  for (i=8; i>0; --i)
  {
    int mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, Spill;

      // odd-indexed inputs (1, 3, 5, 7)
    mm4 = (int)pIn[7*8];
    mm5 = (int)pIn[5*8];
    mm6 = (int)pIn[3*8];
    mm7 = (int)pIn[1*8];

    mm2 = mm4 + mm6;
    mm3 = mm5 + mm7;
    ROTATE2(mm2, mm3,  ROT17_C, -ROT17_SpC, -ROT17_SmC, mm1);
    ROTATE2(mm4, mm7, -ROT37_C,  ROT37_SpC,  ROT37_SmC, mm1);
    ROTATE2(mm5, mm6, -ROT13_C,  ROT13_SmC,  ROT13_SpC, mm1);

    mm4 += mm2;
    mm5 += mm3;
    mm6 += mm2;
    mm7 += mm3;

      // even-indexed inputs (0, 2, 4, 6)
    mm3 = (int)pIn[2*8];
    mm2 = (int)pIn[6*8];

    ROTATE2(mm3, mm2, ROT6_C, ROT6_SmC, -ROT6_SpC, mm1);

    LOAD_BUTF(mm0, mm1, 0*8, 4*8, Spill, pIn);
    mm0 = SHIFTL(mm0, FIX) + HALF(FIX+IPASS+3);
    mm1 = SHIFTL(mm1, FIX) + HALF(FIX+IPASS+3);
    BUTF(mm0, mm3, Spill);
    BUTF(mm1, mm2, Spill);

      // final butterflies: even part +/- odd part
    BUTF(mm0, mm7, Spill);
    pIn[8*0] = (TYPE) SHIFTR(mm0, FIX+IPASS+3);
    pIn[8*7] = (TYPE) SHIFTR(mm7, FIX+IPASS+3);
    BUTF(mm1, mm6, mm0);
    pIn[8*1] = (TYPE) SHIFTR(mm1, FIX+IPASS+3);
    pIn[8*6] = (TYPE) SHIFTR(mm6, FIX+IPASS+3);
    BUTF(mm2, mm5, mm0);
    pIn[8*2] = (TYPE) SHIFTR(mm2, FIX+IPASS+3);
    pIn[8*5] = (TYPE) SHIFTR(mm5, FIX+IPASS+3);
    BUTF(mm3, mm4, mm0);
    pIn[8*3] = (TYPE) SHIFTR(mm3, FIX+IPASS+3);
    pIn[8*4] = (TYPE) SHIFTR(mm4, FIX+IPASS+3);

    pIn++;
  }
}

#undef FIX
#undef FPASS
#undef IPASS

#undef BUTF
#undef LOAD_BUTF
#undef ROTATE
#undef ROTATE2
#undef SHIFTL
#undef SHIFTR

#undef TYPE

//////////////////////////////////////////////////////////
//   NASM source
//////////////////////////////////////////////////////////

/*

; [BITS 32]

;//////////////////////////////////////////////////////////////////////
;
; -=FDCT=-
;
; Vertical pass is an implementation of the scheme:
;  Loeffler C., Ligtenberg A., and Moschytz C.S.:
;  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
;  Proc. ICASSP 1989, 988-991.
;
; Horizontal pass is a double 4x4 vector/matrix multiplication,
; (see also Intel's Application Note 922:
;  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
;  Copyright (C) 1999 Intel Corporation)
;
; Notes:
;  * tan(3pi/16) is greater than 0.5, and would use the
;    sign bit when turned into 16b fixed-point precision. So,
;    we use the trick: x*tan3 = x*(tan3-1)+x
;
;  * There's only one SSE-specific instruction (pshufw).
;
;  * There's still 1 or 2 ticks to save in fLLM_PASS, but
;    I prefer having a readable code, instead of a tightly
;    scheduled one...
;
;  * Quantization stage (as well as pre-transposition for the
;    idct way back) can be included in the fTab* constants
;    (with induced loss of precision, somehow)
;
;  * Some more details at: http://skal.planet-d.net/coding/dct.html
;
;
;//////////////////////////////////////////////////////////////////////
;
;  == Mean square errors ==
;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.000    [0.001]
;   0.035 0.029 0.032 0.032 0.031 0.032 0.034 0.035    [0.032]
;   0.026 0.028 0.027 0.027 0.025 0.028 0.028 0.025    [0.027]
;   0.037 0.032 0.031 0.030 0.028 0.029 0.026 0.031    [0.030]
;   0.000 0.001 0.001 0.002 0.000 0.002 0.001 0.001    [0.001]
;   0.025 0.024 0.022 0.022 0.022 0.022 0.023 0.023    [0.023]
;   0.026 0.028 0.025 0.028 0.030 0.025 0.026 0.027    [0.027]
;   0.021 0.020 0.020 0.022 0.020 0.022 0.017 0.019    [0.020]
;
;  == Abs Mean errors ==
;   0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000    [0.000]
;   0.020 0.001 0.003 0.003 0.000 0.004 0.002 0.003    [0.002]
;   0.000 0.001 0.001 0.001 0.001 0.004 0.000 0.000    [0.000]
;   0.027 0.001 0.000 0.002 0.002 0.002 0.001 0.000    [0.003]
;   0.000 0.000 0.000 0.000 0.000 0.001 0.000 0.001    [-0.000]
;   0.001 0.003 0.001 0.001 0.002 0.001 0.000 0.000    [-0.000]
;   0.000 0.002 0.002 0.001 0.001 0.002 0.001 0.000    [-0.000]
;   0.000 0.002 0.001 0.002 0.001 0.002 0.001 0.001    [-0.000]
;
;  =========================
;  Peak error:   1.0000
;  Peak MSE:     0.0365
;  Overall MSE:  0.0201
;  Peak ME:      0.0265
;  Overall ME:   0.0006
;
;//////////////////////////////////////////////////////////////////////
;
; -=IDCT=-
;
; A little slower than fdct, because the final stages (butterflies and
; descaling) require some unpairable shifting and packing, all on
; the same CPU unit.
;
;   THIS IDCT IS NOT IEEE-COMPLIANT: IT WILL FAIL THE [-300,300]
;   INPUT RANGE TEST (because of overflow). But the [-256,255] one
;   is OK, and I'm fine with it (for now;)
;
;  == Mean square errors ==
;
; (the NASM notes stop at this point in this copy of the file)

*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -