📄 skl_dct_aan.cpp
字号:
/* * skl_dct_AAN.cpp * * AAN implementation of Forward Discrete Cosine Transform * designed for [0..255] input only. * * See: * Arai Y., Agui T., Nakajima M.: A fast DCT-SQ Scheme for Images, * Trans IEICE #71 (1988), pp 1095-1097. * * (C) 2002 Pascal Massimino * details at http://skal.planet-d.net/coding/dct.html ********************************************************/#include <math.h>#include <stdio.h>typedef short SKL_INT16;typedef long SKL_INT32;#define IN_TYPE SKL_INT16#define OUT_TYPE SKL_INT16#define TMP_TYPE SKL_INT32//////////////////////////////////////////////////////////#define SHIFTL(m,n) ((m)<<(n))#define FX(x,n) ((IN_TYPE)floor((x)*(1<<(n)) + .5))#define HALF(n) (1<<((n)-1))#define PMULHW(x, y) ( (IN_TYPE)( ((TMP_TYPE)(x)*(TMP_TYPE)(y))>>16 ) )#define STORE(a, m) In[(a)*8] = (OUT_TYPE)(m);#define LOAD(m, a) (m) = (IN_TYPE) In[(a)*8]#define LOAD_BUTF(m1, m2, a, b) \ (m1) = (IN_TYPE)In[(a)*8] - (IN_TYPE)In[(b)*8]; \ (m2) = (IN_TYPE)In[(a)*8] + (IN_TYPE)In[(b)*8]#define BUTF(a, b, tmp) \ (tmp) = (a)+(b); \ (a) = (a)-(b); \ (b) = (tmp)#define BUTFSAFE(a, b, tmp) \ (tmp) = (IN_TYPE)( ((b)/2+(a)/2) ); \ (a) = (IN_TYPE)( ((a)/2-(b)/2) ); \ (b) = (tmp)//////////////////////////////////////////////////////////static const double c2 = cos(2.*M_PI/16); // 0.70710678118654757static const double c4 = cos(4.*M_PI/16); // 0.70710678118654757static const double c6 = cos(6.*M_PI/16); // 0.38268343236508984static const double d6 = c6/c4; // = c2-c6 = 0.54119610014619701static const double d2 = c2/c4; // = c2+d6 = 1.3065629648763764static const double d1 = 1./cos(1.*M_PI/16); // 1.0195911582083184static const double e1 = 1./sin(1.*M_PI/16); // 5.1258308954830127static const double d3 = 1./cos(3.*M_PI/16); // 1.2026897738700906static const double e3 = 1./sin(3.*M_PI/16); // 1.7999524462728316static const double d4 = 1./cos(4.*M_PI/16); // 1.4142135623730949static const IN_TYPE D2 = FX( d2-1., 16 ); // 1.3065629648763764static const IN_TYPE D6 = FX( d6-.5, 17 ); // 0.54119610014619701static const IN_TYPE C4 = FX( c4-.5, 17 ); // 0.70710678118654757static const IN_TYPE C6 = FX( c6, 16 ); // 0.38268343236508984static const IN_TYPE uC4 = FX( cos(4.*M_PI/16)-.5, 16 );#define PASSB 2static void DCT_FWD_PASS1(OUT_TYPE *In){ IN_TYPE mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; IN_TYPE Spill; LOAD_BUTF(mm7, mm0, 0, 7); LOAD_BUTF(mm4, mm3, 3, 4); LOAD_BUTF(mm6, mm1, 1, 6); LOAD_BUTF(mm5, mm2, 2, 5); BUTF( mm0, mm3, Spill ); BUTF( mm1, mm2, Spill ); BUTF( mm3, mm2, Spill ); mm2 = SHIFTL(mm2, PASSB); mm3 = SHIFTL(mm3, PASSB); mm0 = SHIFTL(mm0, PASSB); mm1 = SHIFTL(mm1, PASSB); STORE(0, mm2); STORE(4, mm3); // Rot6 mm1 = mm0+mm1; mm1 = PMULHW(mm1, uC4) + (mm1>>1); BUTF( mm0, mm1, Spill); mm1 |= 1; STORE(2, mm1); STORE(6, mm0); // Rot1/3 mm4 = SHIFTL(mm4, PASSB); mm5 = SHIFTL(mm5, PASSB); mm6 = SHIFTL(mm6, PASSB); mm7 = SHIFTL(mm7, PASSB); mm4 += mm5; mm5 += mm6; mm6 += mm7; mm0 = mm6 - mm4; mm4 = PMULHW( mm4, D6 ) + mm4; mm5 = PMULHW( mm5, C4 ) + mm5; mm6 = PMULHW( mm6, D2 ) + mm6; mm0 = PMULHW( mm0, C6 ); mm4 = mm4 >> 1; mm5 = (mm5+1) >> 1; mm4 = mm4 - mm0; mm6 = mm6 - mm0; BUTF( mm7, mm5, Spill ); BUTF( mm7, mm4, Spill ); BUTF( mm5, mm6, Spill ); STORE(1, mm6); STORE(3, mm7); STORE(5, mm4); STORE(7, mm5);}static void DCT_FWD_PASS2(OUT_TYPE *In){ IN_TYPE mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; IN_TYPE Spill; LOAD_BUTF(mm7, mm0, 0, 7); LOAD_BUTF(mm4, mm3, 3, 4); LOAD_BUTF(mm6, mm1, 1, 6); LOAD_BUTF(mm5, mm2, 2, 5); BUTF( mm0, mm3, Spill ); BUTF( mm1, mm2, Spill ); BUTFSAFE( mm3, mm2, Spill ); STORE(0, mm2); STORE(4, mm3); // Rot6 mm1 = mm0+mm1; mm1 = PMULHW(mm1, uC4) + (mm1>>1); BUTF( mm0, mm1, Spill); STORE(2, mm1); STORE(6, mm0); // Rot1/3 mm4 += mm5; mm5 += mm6; mm6 += mm7; mm0 = mm6 - mm4; mm4 = PMULHW( mm4, D6 ) + mm4; mm5 = PMULHW( mm5, C4 ) + mm5; mm6 = PMULHW( mm6, D2 ) + mm6; mm0 = PMULHW( mm0, C6 ); mm4 >>= 1; mm5 >>= 1; mm4 = mm4 - mm0; mm6 = mm6 - mm0; mm5 |= 1; BUTF( mm7, mm5, Spill ); BUTF( mm7, mm4, Spill ); BUTF( mm5, mm6, Spill ); STORE(1, mm6); STORE(3, mm7); STORE(5, mm4); STORE(7, mm5);}//////////////////////////////////////////////////////////static IN_TYPE Final[64] = {0};static double Pass_Scale[8] = { d4, d1, d6, d3, d4, e3, d2, e1 };static int Final_Shift[64] = { 13, 13, 14, 13, 13, 13, 14, 13, 12, 12, 13, 12, 12, 12, 13, 12, 13, 13, 14, 13, 13, 13, 14, 13, 12, 12, 13, 12, 12, 12, 13, 12, 13, 13, 14, 13, 13, 13, 14, 13, 12, 12, 13, 12, 12, 12, 13, 12, 13, 13, 14, 13, 13, 13, 14, 13, 12, 12, 13, 12, 12, 12, 13, 12};static int Final_Offset[64] = { -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};static void Init_Cst() { for(int k=0; k<64; ++k) { double Cst = Pass_Scale[k/8]*Pass_Scale[k%8]; Final[k] = (IN_TYPE)FX(Cst, Final_Shift[k]-PASSB)+Final_Offset[k]; }// Print_Cst();}static void Print_Cst() { printf( "Mults dw" ); for(int k=0; k<64; ++k) { printf( " 0x%.4x,", Final[k] ); if ((k&7)==7) if (k!=63) printf( "\n dw"); else printf("\n"); } printf( "Halves dw" ); for(int l=0; l<64; ++l) { IN_TYPE M = Final[l]; IN_TYPE H = HALF(16)/M; printf( " 0x%.4x,", H ); if ((l&7)==7) if (l!=63) printf( "\n dw"); else printf("\n"); }}///////////////////////////////////////////////////////////*IEEE1180-like Errors specs:Peak error: 2.0000Peak MSE: 0.2821Overall MSE: 0.0675Peak ME: 0.1910Overall ME: -0.0183*/static void TRANSPOSE( OUT_TYPE *In ){ for(int i=0; i<8; ++i) for(int j=i+1; j<8; ++j) { SKL_INT16 Tmp = In[i+8*j]; In[i+8*j] = In[j+8*i]; In[j+8*i] = Tmp; }}static void FINISH(OUT_TYPE *In){ if (Final[0]==0) Init_Cst(); for(int i=0; i<64; ++i) { IN_TYPE M = Final[i]; IN_TYPE H = HALF(16)/M; IN_TYPE C = (IN_TYPE)In[i]; In[i] = (OUT_TYPE)PMULHW( C+H, M ); }}void Skl_Dct16_AAN1_C( OUT_TYPE *In ){ int i; TRANSPOSE(In); for(i=0; i<8; ++i) DCT_FWD_PASS1(In+i); TRANSPOSE(In); for(i=0; i<8; ++i) DCT_FWD_PASS2(In+i); FINISH(In);}//////////////////////////////////////////////////////////// NASM source///////////////////////////////////////////////////////////*; [BITS 32];//////////////////////////////////////////////////////////////////////SECTION .dataalign 16Mults dw 0x0fff, 0x0b89, 0x0c3f, 0x0d9b, 0x0fff, 0x145d, 0x1d90, 0x39fe, dw 0x05c5, 0x0429, 0x046a, 0x04e8, 0x05c5, 0x0757, 0x0aa8, 0x14e8, dw 0x061f, 0x046a, 0x04b0, 0x0535, 0x061f, 0x07cb, 0x0b50, 0x1631, dw 0x06ce, 0x04e8, 0x0535, 0x05c9, 0x06ce, 0x08a9, 0x0c92, 0x18a9, dw 0x0fff, 0x0b89, 0x0c3f, 0x0d9b, 0x0fff, 0x145d, 0x1d90, 0x39fe, dw 0x0a2f, 0x0757, 0x07cb, 0x08a9, 0x0a2f, 0x0cf6, 0x12d0, 0x24e8, dw 0x0ec8, 0x0aa8, 0x0b50, 0x0c92, 0x0ec8, 0x12d0, 0x1b50, 0x3594, dw 0x1cff, 0x14e8, 0x1631, 0x18a9, 0x1cff, 0x24e8, 0x3594, 0x6919Halves dw 0x0008, 0x000b, 0x000a, 0x0009, 0x0008, 0x0006, 0x0004, 0x0002, dw 0x0016, 0x001e, 0x001c, 0x001a, 0x0016, 0x0011, 0x000c, 0x0006, dw 0x0014, 0x001c, 0x001b, 0x0018, 0x0014, 0x0010, 0x000b, 0x0005, dw 0x0012, 0x001a, 0x0018, 0x0016, 0x0012, 0x000e, 0x000a, 0x0005, dw 0x0008, 0x000b, 0x000a, 0x0009, 0x0008, 0x0006, 0x0004, 0x0002, dw 0x000c, 0x0011, 0x0010, 0x000e, 0x000c, 0x0009, 0x0006, 0x0003, dw 0x0008, 0x000c, 0x000b, 0x000a, 0x0008, 0x0006, 0x0004, 0x0002, dw 0x0004, 0x0006, 0x0005, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001D2 times 8 dw 0x4e7bD6 times 8 dw 0x1518C4 times 8 dw 0x6a0aC6 times 8 dw 0x61f8uC4 times 8 dw 0x3505Rounder times 8 dw 0x0001Spill times 8 dw 0;//////////////////////////////////////////////////////////////////////SECTION .text;//////////////////////////////////////////////////////////////////////; in-place 8x8 TRANSPOSE%macro TRANSPOSE_4x4_MMX 1 ; #1: Ptr. uses m0,m1,m2,m3, + m4,m5 movq mm0, [%1+0*16] ; 00 01 02 03 movq mm4, [%1+2*16] ; 20 21 22 23 movq mm1, [%1+1*16] ; 10 11 12 13 movq mm3, [%1+3*16] ; 30 31 32 33 movq mm2, mm0 movq mm5, mm4 punpcklwd mm0, mm1 ; mm0 = 00 10 01 11 punpckhwd mm2, mm1 ; mm2 = 02 12 03 13 punpcklwd mm4, mm3 ; mm4 = 20 30 21 31 punpckhwd mm5, mm3 ; mm5 = 22 32 23 33 movq mm1, mm0 punpckhdq mm1, mm4 ; mm1 = 01 11 21 31 movq mm3, mm2 punpckldq mm0, mm4 ; mm0 = 00 10 20 30 movq [%1+1*16], mm1 punpckldq mm2, mm5 ; mm2 = 02 12 22 32 movq [%1+0*16], mm0 punpckhdq mm3, mm5 ; mm3 = 03 13 23 33 movq [%1+2*16], mm2 movq [%1+3*16], mm3%endmacro%macro TRANSPOSE_8x4_MMX 1 TRANSPOSE_4x4_MMX %1 TRANSPOSE_4x4_MMX %1 +8%endmacro%macro TRANSPOSE_4x8_MMX 1 TRANSPOSE_4x4_MMX %1 TRANSPOSE_4x4_MMX %1 + 4*16%endmacro%macro MULT2 2 ; Ptr, i movq mm0, [%1 +%2*16 + 0] movq mm1, [%1 +%2*16 + 8] paddw mm0, [Halves+%2*16 + 0] paddw mm1, [Halves+%2*16 + 8] pmulhw mm0, [Mults+%2*16 + 0] pmulhw mm1, [Mults+%2*16 + 8] movq [%1 +%2*16 + 0], mm0 movq [%1 +%2*16 + 8], mm1%endmacro;//////////////////////////////////////////////////////////////////////%macro LOAD 3 ; m1, a, In ; performs: m1 = In[a] movq %1, [%3 + %2*8]%endmacro%macro LOAD_BUTF 5 ; m1, m2, a, b, In ; 'butterflied' load: m1 = In[a]-In[b], m2 = In[a]+In[b] movq %1, [%5 + %3*8] movq %2, [%5 + %4*8] psubw %1, %2 paddw %2, [%5 + %3*8]%endmacro%macro STORE 3 ; m1, a, Out movq [%3 + %2*8], %1%endmacro%macro LOAD2 3 ; m1, a, In ; performs: m1 = In[a] movq %1, [%3 + %2*16]%endmacro%macro LOAD2_BUTF 5 ; m1, m2, a, b, In ; 'butterflied' load: m1 = In[a]-m2, m2 = In[a]+m2 movq %1, [%5 + %3*16] movq %2, [%5 + %4*16] psubw %1, %2 paddw %2, [%5 + %3*16]%endmacro%macro STORE2 3 ; m1, a, Out movq [%3 + %2*16], %1%endmacro%macro BUTF 3 ; a, b, tmp ; performs butterfly: (a,b) -> (a-b, a+b) movq %3, %2 paddsw %2, %1 psubsw %1, %3%endmacro%macro BUTF_SAFE 3 ; a, b, tmp ; performs butterfly: (a,b) -> ( a/2-b/2, a/2+b/2 ) psraw %1, 1 psraw %2, 1 movq %3, %2 paddsw %2, %1 psubsw %1, %3%endmacro%macro DBL_BUTF 6 ; a, b, tmp, a', b', tmp' ; performs double butterfly on (a,b) / (a',b') movq %3, %2 movq %6, %5 paddw %2, %1 paddw %5, %4 psubw %1, %3 psubw %4, %6%endmacro%macro SHIFTL 2 psllw %1, %2%endmacro;//////////////////////////////////////////////////////////////////////%define PASSB 2%macro FOUR_COLS_FWD_PASS1 1 ; In ; stage 1 -even part- ; ~20c LOAD_BUTF mm7, mm0, 0, 7, %1 ; 0, 7 LOAD_BUTF mm4, mm3, 6, 1, %1 ; 3, 4 BUTF mm0, mm3, mm6 LOAD_BUTF mm6, mm1, 2, 5, %1 ; 1, 6 LOAD_BUTF mm5, mm2, 4, 3, %1 ; 2, 5 BUTF mm1, mm2, [Spill] BUTF mm3, mm2, [Spill] SHIFTL mm0, PASSB SHIFTL mm1, PASSB SHIFTL mm2, PASSB SHIFTL mm3, PASSB STORE mm2, 0, %1 ; ->0 ; preserve mm3 ; Rot6 paddw mm1, mm0 movq mm2, mm1 pmulhw mm1, [uC4] psraw mm2, 1 STORE mm3, 1, %1 ; ->4 ; leftover from previous stage paddw mm1, mm2 BUTF mm0, mm1, mm2 por mm1, [Rounder] STORE mm1, 4, %1 ; ->2 ; leftover from previous stage STORE mm0, 5, %1 ; ->6 ; leftover from previous stage ; preserve mm0, mm1 ; stage 2 -odd part- SHIFTL mm4, PASSB SHIFTL mm5, PASSB SHIFTL mm6, PASSB SHIFTL mm7, PASSB paddw mm4, mm5 paddw mm5, mm6 paddw mm6, mm7 movq mm2, mm6 psubw mm2, mm4 ; mm2 <=> mm0 movq mm0, mm4 movq mm1, mm5 pmulhw mm4, [D6] pmulhw mm5, [C4] paddw mm4, mm0 paddw mm5, mm1 psraw mm4, 1 paddw mm5, [Rounder] psraw mm5, 1 movq mm3, mm6 pmulhw mm6, [D2] paddw mm6, mm3 pmulhw mm2, [C6] psubw mm4, mm2 psubw mm6, mm2 BUTF mm7, mm5, mm2 DBL_BUTF mm7, mm4, mm0, mm5, mm6, mm1 STORE mm6, 2, %1 ; ->1 STORE mm7, 6, %1 ; ->3 STORE mm4, 3, %1 ; ->5 STORE mm5, 7, %1 ; ->7%endmacro%macro FOUR_COLS_FWD_PASS2 2 ; In, Scale offset ; stage 1 -even part- LOAD2_BUTF mm7, mm0, 0, 7, %1 ; <- 0, 7 LOAD2_BUTF mm4, mm3, 3, 4, %1 ; <- 3, 4 BUTF mm0, mm3, mm6 LOAD2_BUTF mm6, mm1, 1, 6, %1 ; <- 1,6 LOAD2_BUTF mm5, mm2, 2, 5, %1 ; <- 2,5 BUTF mm1, mm2, [Spill] BUTF_SAFE mm3, mm2, [Spill] ; overflow-prone paddw mm2, [Halves + 0*16 + %2] paddw mm3, [Halves + 4*16 + %2] pmulhw mm2, [Mults + 0*16 + %2] pmulhw mm3, [Mults + 4*16 + %2] STORE2 mm2, 0, %1 ; ->0 STORE2 mm3, 4, %1 ; ->4 ; leftover from previous stage ; Rot6 paddw mm1, mm0 movq mm2, mm1 psraw mm2, 1 pmulhw mm1, [uC4] paddw mm1, mm2 BUTF mm0, mm1, mm2 paddw mm1, [Halves + 2*16 + %2] paddw mm0, [Halves + 6*16 + %2] pmulhw mm1, [Mults + 2*16 + %2] pmulhw mm0, [Mults + 6*16 + %2] STORE2 mm1, 2, %1 ; ->2 ; leftover STORE2 mm0, 6, %1 ; ->6 ; preserve mm0, mm1 ; stage 2 -odd part- paddw mm4, mm5 paddw mm5, mm6 paddw mm6, mm7 movq mm2, mm6 psubw mm2, mm4 ; mm2 <=> mm0 movq mm0, mm4 movq mm1, mm5 pmulhw mm4, [D6] pmulhw mm5, [C4] movq mm3, mm6 pmulhw mm6, [D2] paddw mm4, mm0 paddw mm5, mm1 psraw mm4, 1 psraw mm5, 1 paddw mm6, mm3 pmulhw mm2, [C6] psubw mm4, mm2 psubw mm6, mm2 por mm5, [Rounder] BUTF mm7, mm5, mm2 DBL_BUTF mm7, mm4, mm0, mm5, mm6, mm1 paddw mm6, [Halves + 1*16 + %2] paddw mm7, [Halves + 3*16 + %2] pmulhw mm6, [Mults + 1*16 + %2] paddw mm4, [Halves + 5*16 + %2] pmulhw mm7, [Mults + 3*16 + %2] STORE2 mm6, 1, %1 ; ->1 paddw mm5, [Halves + 7*16 + %2] pmulhw mm4, [Mults + 5*16 + %2] STORE2 mm7, 3, %1 ; ->3 pmulhw mm5, [Mults + 7*16 + %2] STORE2 mm4, 5, %1 ; ->5 STORE2 mm5, 7, %1 ; ->7%endmacroSkl_Dct16_AAN: ; interlaced final-scale: total: 293c mov ecx,[esp+4] ; In ; 4 transposes: ~80c ; PASS1 x 2: ~100c ; PASS2 x 2: ~110c TRANSPOSE_8x4_MMX ecx FOUR_COLS_FWD_PASS1 ecx TRANSPOSE_8x4_MMX ecx + 4*16 FOUR_COLS_FWD_PASS1 ecx+4*16 TRANSPOSE_4x8_MMX ecx FOUR_COLS_FWD_PASS2 ecx, 0 TRANSPOSE_4x8_MMX ecx+8 FOUR_COLS_FWD_PASS2 ecx+8, 8%if 0 ; this descaling phase could be fused with Quantizing stage MULT2 ecx, 0 MULT2 ecx, 1 MULT2 ecx, 2 MULT2 ecx, 3 MULT2 ecx, 4 MULT2 ecx, 5 MULT2 ecx, 6 MULT2 ecx, 7%endif ret;//////////////////////////////////////////////////////////////////////*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -